Skip to content

Commit 2795d4b

Browse files
committed
Merge branch 'main' into expr-union-type-impl
2 parents 5c48526 + 68bfda9 commit 2795d4b

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

42 files changed

+1116
-934
lines changed

.github/workflows/_doc_release.yml

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
name: Release Docs
2+
3+
on:
4+
workflow_call:
5+
6+
jobs:
7+
deploy:
8+
runs-on: ubuntu-latest
9+
environment: docs-release
10+
steps:
11+
- uses: actions/checkout@v4
12+
- uses: actions/setup-node@v4
13+
with:
14+
node-version: 18
15+
cache: yarn
16+
cache-dependency-path: docs/yarn.lock
17+
- uses: webfactory/ssh-agent@v0.5.0
18+
with:
19+
ssh-private-key: ${{ secrets.GH_PAGES_DEPLOY }}
20+
- name: Deploy to GitHub Pages
21+
env:
22+
USE_SSH: true
23+
run: |
24+
export COCOINDEX_DOCS_POSTHOG_API_KEY=${{ vars.COCOINDEX_DOCS_POSTHOG_API_KEY }}
25+
export COCOINDEX_DOCS_MIXPANEL_API_KEY=${{ vars.COCOINDEX_DOCS_MIXPANEL_API_KEY }}
26+
export COCOINDEX_DOCS_ALGOLIA_APP_ID=${{ vars.COCOINDEX_DOCS_ALGOLIA_APP_ID }}
27+
export COCOINDEX_DOCS_ALGOLIA_API_KEY=${{ vars.COCOINDEX_DOCS_ALGOLIA_API_KEY }}
28+
git config --global user.email "${{ vars.COCOINDEX_DOCS_DEPLOY_USER_EMAIL }}"
29+
git config --global user.name "${{ vars.COCOINDEX_DOCS_DEPLOY_USER_NAME }}"
30+
yarn --cwd docs install --frozen-lockfile
31+
yarn --cwd docs deploy

.github/workflows/docs.yml

Lines changed: 5 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -6,19 +6,14 @@ on:
66
paths:
77
- docs/**
88
- ".github/workflows/docs.yml"
9-
push:
10-
branches: [main]
11-
paths:
12-
- docs/**
13-
- ".github/workflows/docs.yml"
149
workflow_dispatch:
1510

1611
permissions:
1712
contents: write
1813

1914
jobs:
2015
test-deploy:
21-
if: github.event_name != 'push'
16+
if: github.event_name == 'pull_request'
2217
runs-on: ubuntu-latest
2318
steps:
2419
- uses: actions/checkout@v4
@@ -32,43 +27,8 @@ jobs:
3227
- name: Test build website
3328
run: yarn --cwd docs build
3429

35-
deploy-precheck:
36-
if: ${{ github.event_name != 'pull_request' }}
37-
runs-on: ubuntu-latest
38-
environment: docs-release
39-
outputs:
40-
gh-deploy-key: ${{ steps.gh-deploy-key.outputs.defined }}
41-
steps:
42-
- id: gh-deploy-key
43-
env:
44-
GH_PAGES_DEPLOY: ${{ secrets.GH_PAGES_DEPLOY }}
45-
if: "${{ env.GH_PAGES_DEPLOY != '' }}"
46-
run: echo "defined=true" >> $GITHUB_OUTPUT
47-
4830
deploy:
49-
needs: [deploy-precheck]
50-
if: ${{ needs.deploy-precheck.outputs.gh-deploy-key == 'true' }}
51-
runs-on: ubuntu-latest
52-
environment: docs-release
53-
steps:
54-
- uses: actions/checkout@v4
55-
- uses: actions/setup-node@v4
56-
with:
57-
node-version: 18
58-
cache: yarn
59-
cache-dependency-path: docs/yarn.lock
60-
- uses: webfactory/ssh-agent@v0.5.0
61-
with:
62-
ssh-private-key: ${{ secrets.GH_PAGES_DEPLOY }}
63-
- name: Deploy to GitHub Pages
64-
env:
65-
USE_SSH: true
66-
run: |
67-
export COCOINDEX_DOCS_POSTHOG_API_KEY=${{ vars.COCOINDEX_DOCS_POSTHOG_API_KEY }}
68-
export COCOINDEX_DOCS_MIXPANEL_API_KEY=${{ vars.COCOINDEX_DOCS_MIXPANEL_API_KEY }}
69-
export COCOINDEX_DOCS_ALGOLIA_APP_ID=${{ vars.COCOINDEX_DOCS_ALGOLIA_APP_ID }}
70-
export COCOINDEX_DOCS_ALGOLIA_API_KEY=${{ vars.COCOINDEX_DOCS_ALGOLIA_API_KEY }}
71-
git config --global user.email "${{ vars.COCOINDEX_DOCS_DEPLOY_USER_EMAIL }}"
72-
git config --global user.name "${{ vars.COCOINDEX_DOCS_DEPLOY_USER_NAME }}"
73-
yarn --cwd docs install --frozen-lockfile
74-
yarn --cwd docs deploy
31+
name: Release Docs
32+
if: ${{ github.event_name == 'workflow_dispatch' }}
33+
uses: ./.github/workflows/_doc_release.yml
34+
secrets: inherit

.github/workflows/release.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,3 +107,9 @@ jobs:
107107
with:
108108
command: upload
109109
args: --non-interactive --skip-existing wheels-*/*
110+
111+
release-docs:
112+
name: Release Docs
113+
needs: [release]
114+
uses: ./.github/workflows/_doc_release.yml
115+
secrets: inherit

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ __pycache__/
99
*.so
1010

1111
# Distribution / packaging
12-
.venv/
12+
.venv*/
1313
dist/
1414

1515
.DS_Store

Cargo.lock

Lines changed: 15 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ name = "cocoindex"
44
# Will be overridden for specific release versions.
55
version = "999.0.0"
66
edition = "2024"
7+
rust-version = "1.86"
78

89
[profile.release]
910
codegen-units = 1
@@ -50,6 +51,7 @@ tower-http = { version = "0.6.2", features = ["cors", "trace"] }
5051
indexmap = { version = "2.8.0", features = ["serde"] }
5152
blake2 = "0.10.6"
5253
pgvector = { version = "0.4.0", features = ["sqlx"] }
54+
phf = { version = "0.11.3", features = ["macros"] }
5355
indenter = "0.3.3"
5456
itertools = "0.14.0"
5557
derivative = "2.2.0"

docs/docs/core/data_types.mdx

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,16 +36,17 @@ This is the list of all basic types supported by CocoIndex:
3636
| LocalDatetime | Date and time without timezone | `cocoindex.LocalDateTime` | `datetime.datetime` |
3737
| OffsetDatetime | Date and time with a timezone offset | `cocoindex.OffsetDateTime` | `datetime.datetime` |
3838
| TimeDelta | A duration of time | `datetime.timedelta` | `datetime.timedelta` |
39-
| Vector[*T*, *Dim*?] | *T* must be basic type. *Dim* is a positive integer and optional. |`cocoindex.Vector[T]` or `cocoindex.Vector[T, Dim]` | `list[T]` |
4039
| Json | | `cocoindex.Json` | Any data convertible to JSON by `json` package |
40+
| Vector[*T*, *Dim*?] | *T* can be a basic type or a numeric type. *Dim* is a positive integer and optional. | `cocoindex.Vector[T]` or `cocoindex.Vector[T, Dim]` | `numpy.typing.NDArray[T]` or `list[T]` |
4141

4242
Values of all data types can be represented by values in Python's native types (as described under the Native Python Type column).
4343
However, the underlying execution engine and some storage systems (like Postgres) have finer distinctions for some types, specifically:
4444

4545
* *Float32* and *Float64* for `float`, with different precision.
4646
* *LocalDateTime* and *OffsetDateTime* for `datetime.datetime`, with different timezone awareness.
47-
* *Vector* has optional dimension information.
4847
* *Range* and *Json* provide a clear tag for the type, to clearly distinguish the type in CocoIndex.
48+
* *Vector* holds elements of type *T*. If *T* is numeric (e.g., `np.float32` or `np.float64`), it's represented as `NDArray[T]`; otherwise, as `list[T]`.
49+
* *Vector* also has optional dimension information.
4950

5051
The native Python type is always more permissive and can represent a superset of possible values.
5152
* Only when you annotate the return type of a custom function, you should use the specific type,

docs/docs/getting_started/quickstart.md

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -154,11 +154,11 @@ The goal of transforming your data is usually to query against it.
154154
Once your index is built, you can directly access the transformed data in the target database.
155155
CocoIndex also provides utilities for you to do this more seamlessly.
156156
157-
In this example, we'll use the [`psycopg` library](https://www.psycopg.org/) to connect to the database and run queries.
158-
Please make sure it's installed:
157+
In this example, we'll use the [`psycopg` library](https://www.psycopg.org/) along with pgvector to connect to the database and run queries on vector data.
158+
Please make sure the required packages are installed:
159159

160160
```bash
161-
pip install psycopg[binary,pool]
161+
pip install numpy "psycopg[binary,pool]" pgvector
162162
```
163163

164164
### Step 4.1: Extract common transformations
@@ -169,8 +169,11 @@ i.e. they should use exactly the same embedding model and parameters.
169169
Let's extract that into a function:
170170
171171
```python title="quickstart.py"
172+
from numpy.typing import NDArray
173+
import numpy as np
174+
172175
@cocoindex.transform_flow()
173-
def text_to_embedding(text: cocoindex.DataSlice[str]) -> cocoindex.DataSlice[list[float]]:
176+
def text_to_embedding(text: cocoindex.DataSlice[str]) -> cocoindex.DataSlice[NDArray[np.float32]]:
174177
return text.transform(
175178
cocoindex.functions.SentenceTransformerEmbed(
176179
model="sentence-transformers/all-MiniLM-L6-v2"))
@@ -207,6 +210,7 @@ Now we can create a function to query the index upon a given input query:
207210
208211
```python title="quickstart.py"
209212
from psycopg_pool import ConnectionPool
213+
from pgvector.psycopg import register_vector
210214
211215
def search(pool: ConnectionPool, query: str, top_k: int = 5):
212216
# Get the table name, for the export target in the text_embedding_flow above.
@@ -215,9 +219,10 @@ def search(pool: ConnectionPool, query: str, top_k: int = 5):
215219
query_vector = text_to_embedding.eval(query)
216220
# Run the query and get the results.
217221
with pool.connection() as conn:
222+
register_vector(conn)
218223
with conn.cursor() as cur:
219224
cur.execute(f"""
220-
SELECT filename, text, embedding <=> %s::vector AS distance
225+
SELECT filename, text, embedding <=> %s AS distance
221226
FROM {table_name} ORDER BY distance LIMIT %s
222227
""", (query_vector, top_k))
223228
return [
@@ -236,7 +241,7 @@ There're two CocoIndex-specific logic:
236241
237242
2. Evaluate the transform flow defined above with the input query, to get the embedding.
238243
It's done by the `eval()` method of the transform flow `text_to_embedding`.
239-
The return type of this method is `list[float]` as declared in the `text_to_embedding()` function (`cocoindex.DataSlice[list[float]]`).
244+
The return type of this method is `NDArray[np.float32]` as declared in the `text_to_embedding()` function (`cocoindex.DataSlice[NDArray[np.float32]]`).
240245
241246
### Step 4.3: Add the main script logic
242247

docs/docs/query.mdx

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ The [quickstart](getting_started/quickstart#step-41-extract-common-transformatio
4141

4242
```python
4343
@cocoindex.transform_flow()
44-
def text_to_embedding(text: cocoindex.DataSlice[str]) -> cocoindex.DataSlice[list[float]]:
44+
def text_to_embedding(text: cocoindex.DataSlice[str]) -> cocoindex.DataSlice[NDArray[np.float32]]:
4545
return text.transform(
4646
cocoindex.functions.SentenceTransformerEmbed(
4747
model="sentence-transformers/all-MiniLM-L6-v2"))
@@ -61,7 +61,7 @@ with doc["chunks"].row() as chunk:
6161
chunk["embedding"] = chunk["text"].call(text_to_embedding)
6262
```
6363

64-
Any time, you can call the `eval()` method with specific string, which will return a `list[float]`:
64+
At any time, you can call the `eval()` method with a specific string, which will return an `NDArray[np.float32]`:
6565

6666
```python
6767
print(text_to_embedding.eval("Hello, world!"))
@@ -93,7 +93,7 @@ For example:
9393

9494
```python
9595
table_name = cocoindex.utils.get_target_storage_default_name(text_embedding_flow, "doc_embeddings")
96-
query = f"SELECT filename, text FROM {table_name} ORDER BY embedding <=> %s::vector DESC LIMIT 5"
96+
query = f"SELECT filename, text FROM {table_name} ORDER BY embedding <=> %s DESC LIMIT 5"
9797
...
9898
```
9999

examples/text_embedding/Text_Embedding.ipynb

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@
4545
},
4646
"outputs": [],
4747
"source": [
48-
"%pip install cocoindex python-dotenv psycopg[binary,pool]"
48+
"%pip install cocoindex numpy python-dotenv psycopg[binary,pool] pgvector"
4949
]
5050
},
5151
{
@@ -164,7 +164,10 @@
164164
"from dotenv import load_dotenv\n",
165165
"import os\n",
166166
"from psycopg_pool import ConnectionPool\n",
167-
"import cocoindex\n"
167+
"from pgvector.psycopg import register_vector\n",
168+
"import cocoindex\n",
169+
"from numpy.typing import NDArray\n",
170+
"import numpy as np\n"
168171
]
169172
},
170173
{
@@ -187,7 +190,7 @@
187190
"%%writefile -a main.py\n",
188191
"\n",
189192
"@cocoindex.transform_flow()\n",
190-
"def text_to_embedding(text: cocoindex.DataSlice[str]) -> cocoindex.DataSlice[list[float]]:\n",
193+
"def text_to_embedding(text: cocoindex.DataSlice[str]) -> cocoindex.DataSlice[NDArray[np.float32]]:\n",
191194
" \"\"\"\n",
192195
" Embed the text using a SentenceTransformer model.\n",
193196
" This is shared logic between indexing and querying.\n",
@@ -274,9 +277,10 @@
274277
" query_vector = text_to_embedding.eval(query)\n",
275278
" # Run the query and get the results.\n",
276279
" with pool.connection() as conn:\n",
280+
" register_vector(conn)\n",
277281
" with conn.cursor() as cur:\n",
278282
" cur.execute(f\"\"\"\n",
279-
" SELECT filename, text, embedding <=> %s::vector AS distance\n",
283+
" SELECT filename, text, embedding <=> %s AS distance\n",
280284
" FROM {table_name} ORDER BY distance LIMIT %s\n",
281285
" \"\"\", (query_vector, top_k))\n",
282286
" return [\n",

0 commit comments

Comments
 (0)