Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions .github/workflows/CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,21 @@ permissions:
contents: read

jobs:
format-check:
  # Fails the workflow when any Python file is not formatted the way
  # `ruff format` would write it.
  name: Check Python formatting
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v4
    - uses: actions/setup-python@v5
      with:
        # Quoted so YAML keeps the version a string; an unquoted value such
        # as 3.10 would be parsed as the float 3.1.
        python-version: "3.11"
    - name: Install Ruff
      # NOTE(review): Ruff is installed unpinned, so a new Ruff release may
      # change formatter output and fail this job — consider pinning.
      run: |
        pip install ruff
    - name: Check Python formatting
      # `--check` only reports files that would be reformatted; it does not
      # modify them.
      run: |
        ruff format --check .

test:
name: Run test
uses: ./.github/workflows/_test.yml
4 changes: 3 additions & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,7 @@
"cocoindex",
"reindexing",
"timedelta"
]
],
"editor.formatOnSave": true,
"python.formatting.provider": "ruff"
}
43 changes: 33 additions & 10 deletions examples/amazon_s3_embedding/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,11 @@
import cocoindex
import os


@cocoindex.flow_def(name="AmazonS3TextEmbedding")
def amazon_s3_text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
def amazon_s3_text_embedding_flow(
flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
):
"""
Define an example flow that embeds text from Amazon S3 into a vector database.
"""
Expand All @@ -18,21 +21,32 @@ def amazon_s3_text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scop
prefix=prefix,
included_patterns=["*.md", "*.txt", "*.docx"],
binary=False,
sqs_queue_url=sqs_queue_url))
sqs_queue_url=sqs_queue_url,
)
)

doc_embeddings = data_scope.add_collector()

with data_scope["documents"].row() as doc:
doc["chunks"] = doc["content"].transform(
cocoindex.functions.SplitRecursively(),
language="markdown", chunk_size=2000, chunk_overlap=500)
language="markdown",
chunk_size=2000,
chunk_overlap=500,
)

with doc["chunks"].row() as chunk:
chunk["embedding"] = chunk["text"].transform(
cocoindex.functions.SentenceTransformerEmbed(
model="sentence-transformers/all-MiniLM-L6-v2"))
doc_embeddings.collect(filename=doc["filename"], location=chunk["location"],
text=chunk["text"], embedding=chunk["embedding"])
model="sentence-transformers/all-MiniLM-L6-v2"
)
)
doc_embeddings.collect(
filename=doc["filename"],
location=chunk["location"],
text=chunk["text"],
embedding=chunk["embedding"],
)

doc_embeddings.export(
"doc_embeddings",
Expand All @@ -41,24 +55,32 @@ def amazon_s3_text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scop
vector_indexes=[
cocoindex.VectorIndexDef(
field_name="embedding",
metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)])
metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY,
)
],
)


# Query handler for semantic search over the "doc_embeddings" export target.
# The query text is embedded with a SentenceTransformer model before the
# cosine-similarity lookup.
query_handler = cocoindex.query.SimpleSemanticsQueryHandler(
    name="SemanticsSearch",
    flow=amazon_s3_text_embedding_flow,
    target_name="doc_embeddings",
    default_similarity_metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY,
    query_transform_flow=lambda text: text.transform(
        cocoindex.functions.SentenceTransformerEmbed(
            model="sentence-transformers/all-MiniLM-L6-v2"
        )
    ),
)


def _main():
# Use a `FlowLiveUpdater` to keep the flow data updated.
with cocoindex.FlowLiveUpdater(amazon_s3_text_embedding_flow):
# Run queries in a loop to demonstrate the query capabilities.
while True:
query = input("Enter search query (or Enter to quit): ")
if query == '':
if query == "":
break
results, _ = query_handler.search(query, 10)
print("\nSearch results:")
Expand All @@ -68,6 +90,7 @@ def _main():
print("---")
print()


if __name__ == "__main__":
load_dotenv()
cocoindex.init()
Expand Down
57 changes: 43 additions & 14 deletions examples/code_embedding/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,40 +3,59 @@
import cocoindex
import os


@cocoindex.op.function()
def extract_extension(filename: str) -> str:
    """Return the extension part of *filename* as split by ``os.path.splitext``."""
    _stem, extension = os.path.splitext(filename)
    return extension


@cocoindex.transform_flow()
def code_to_embedding(
    text: cocoindex.DataSlice[str],
) -> cocoindex.DataSlice[list[float]]:
    """
    Turn a text slice into its embedding via a SentenceTransformer model.
    """
    embedder = cocoindex.functions.SentenceTransformerEmbed(
        model="sentence-transformers/all-MiniLM-L6-v2"
    )
    return text.transform(embedder)


@cocoindex.flow_def(name="CodeEmbedding")
def code_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
def code_embedding_flow(
flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
):
"""
Define an example flow that embeds files into a vector database.
"""
data_scope["files"] = flow_builder.add_source(
cocoindex.sources.LocalFile(path="../..",
included_patterns=["*.py", "*.rs", "*.toml", "*.md", "*.mdx"],
excluded_patterns=["**/.*", "target", "**/node_modules"]))
cocoindex.sources.LocalFile(
path="../..",
included_patterns=["*.py", "*.rs", "*.toml", "*.md", "*.mdx"],
excluded_patterns=["**/.*", "target", "**/node_modules"],
)
)
code_embeddings = data_scope.add_collector()

with data_scope["files"].row() as file:
file["extension"] = file["filename"].transform(extract_extension)
file["chunks"] = file["content"].transform(
cocoindex.functions.SplitRecursively(),
language=file["extension"], chunk_size=1000, chunk_overlap=300)
language=file["extension"],
chunk_size=1000,
chunk_overlap=300,
)
with file["chunks"].row() as chunk:
chunk["embedding"] = chunk["text"].call(code_to_embedding)
code_embeddings.collect(filename=file["filename"], location=chunk["location"],
code=chunk["text"], embedding=chunk["embedding"])
code_embeddings.collect(
filename=file["filename"],
location=chunk["location"],
code=chunk["text"],
embedding=chunk["embedding"],
)

code_embeddings.export(
"code_embeddings",
Expand All @@ -45,26 +64,35 @@ def code_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoind
vector_indexes=[
cocoindex.VectorIndexDef(
field_name="embedding",
metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)])
metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY,
)
],
)


def search(pool: ConnectionPool, query: str, top_k: int = 5):
    """
    Look up the stored code chunks closest to *query*.

    The query is embedded with ``code_to_embedding`` and compared against the
    ``embedding`` column of the table backing the "code_embeddings" export
    target. At most *top_k* rows are returned, ordered by ascending distance,
    as dicts with ``filename``, ``code`` and ``score`` (``1.0 - distance``).
    """
    # Table backing the "code_embeddings" export target of code_embedding_flow.
    table_name = cocoindex.utils.get_target_storage_default_name(
        code_embedding_flow, "code_embeddings"
    )
    # Embed the query by evaluating the transform flow on it.
    query_vector = code_to_embedding.eval(query)
    # The table name is interpolated into the SQL text; the values are passed
    # as bound parameters.
    with pool.connection() as conn, conn.cursor() as cursor:
        cursor.execute(
            f"""
                SELECT filename, code, embedding <=> %s::vector AS distance
                FROM {table_name} ORDER BY distance LIMIT %s
            """,
            (query_vector, top_k),
        )
        matches = cursor.fetchall()
        return [
            {"filename": match[0], "code": match[1], "score": 1.0 - match[2]}
            for match in matches
        ]


def _main():
# Make sure the flow is built and up-to-date.
stats = code_embedding_flow.update()
Expand All @@ -75,7 +103,7 @@ def _main():
# Run queries in a loop to demonstrate the query capabilities.
while True:
query = input("Enter search query (or Enter to quit): ")
if query == '':
if query == "":
break
# Run the query function with the database connection pool and the query.
results = search(pool, query)
Expand All @@ -86,6 +114,7 @@ def _main():
print("---")
print()


if __name__ == "__main__":
load_dotenv()
cocoindex.init()
Expand Down
Loading