Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions examples/custom_output_files/main.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
from datetime import timedelta
import os
import dataclasses
from typing import Optional, Dict, Tuple, Any

import cocoindex
from markdown_it import MarkdownIt

_markdown_it = MarkdownIt("gfm-like")


@dataclasses.dataclass
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The `@dataclasses.dataclass` decorator shouldn't be needed here.

class LocalFileTarget(cocoindex.op.TargetSpec):
"""Represents the custom target spec."""

Expand Down Expand Up @@ -36,7 +38,9 @@ def describe(key: str) -> str:

@staticmethod
def apply_setup_change(
key: str, previous: LocalFileTarget | None, current: LocalFileTarget | None
key: str,
previous: Optional[LocalFileTarget],
current: Optional[LocalFileTarget]
) -> None:
"""
Apply setup changes to the target.
Expand Down Expand Up @@ -68,7 +72,7 @@ def prepare(spec: LocalFileTarget) -> LocalFileTarget:

@staticmethod
def mutate(
*all_mutations: tuple[LocalFileTarget, dict[str, LocalFileTargetValues | None]],
*all_mutations: Tuple[LocalFileTarget, Dict[str, Optional[LocalFileTargetValues]]],
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using `tuple`, `dict`, and `| None` is the recommended style in modern Python (built-in generics since Python 3.9, `X | None` unions since Python 3.10). We shouldn't change these.

) -> None:
"""
Mutate the target.
Expand All @@ -90,7 +94,7 @@ def mutate(
except FileNotFoundError:
pass
else:
with open(full_path, "w") as f:
with open(full_path, "w", encoding="utf-8") as f:
f.write(mutation.html)


Expand Down
39 changes: 16 additions & 23 deletions examples/multi_format_indexing/main.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import cocoindex
import os
import mimetypes
from typing import List, Optional, Any, Dict

from dotenv import load_dotenv
from dataclasses import dataclass
Expand All @@ -16,12 +17,12 @@

@dataclass
class Page:
page_number: int | None
page_number: Optional[int]
image: bytes


@cocoindex.op.function()
def file_to_pages(filename: str, content: bytes) -> list[Page]:
def file_to_pages(filename: str, content: bytes) -> List[Page]:
"""
Classify file content based on MIME type detection.
Returns ClassifiedFileContent with appropriate field populated based on file type.
Expand All @@ -31,7 +32,7 @@ def file_to_pages(filename: str, content: bytes) -> list[Page]:

if mime_type == "application/pdf":
images = convert_from_bytes(content, dpi=300)
pages = []
pages: List[Page] = []
for i, image in enumerate(images):
with BytesIO() as buffer:
image.save(buffer, format="PNG")
Expand All @@ -54,46 +55,38 @@ def multi_format_indexing_flow(
flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
) -> None:
"""
Define an example flow that embeds files into a vector database.
Define an example flow that extracts manual information from a Markdown.
"""
data_scope["documents"] = flow_builder.add_source(
cocoindex.sources.LocalFile(path="source_files", binary=True)
cocoindex.sources.LocalFile(path="data", binary=True)
)

output_embeddings = data_scope.add_collector()
embeddings_index = data_scope.add_collector()

with data_scope["documents"].row() as doc:
doc["pages"] = flow_builder.transform(
file_to_pages, filename=doc["filename"], content=doc["content"]
)
doc["pages"] = doc.transform(file_to_pages, filename=doc["filename"], content=doc["content"])
with doc["pages"].row() as page:
page["embedding"] = page["image"].transform(
cocoindex.functions.ColPaliEmbedImage(model=COLPALI_MODEL_NAME)
cocoindex.functions.ColPali(model_name=COLPALI_MODEL_NAME)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we have `ColPali` under the `cocoindex.functions` package, so this won't work.

Is it tested?

)
output_embeddings.collect(
id=cocoindex.GeneratedField.UUID,
embeddings_index.collect(
filename=doc["filename"],
page=page["page_number"],
embedding=page["embedding"],
)

output_embeddings.export(
"multi_format_indexings",
embeddings_index.export(
"output",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should avoid changing the target name. It will force existing users to rebuild the index.

cocoindex.targets.Qdrant(
connection=qdrant_connection,
collection_name=QDRANT_COLLECTION,
vector_field_name="embedding",
),
primary_key_fields=["id"],
primary_key_fields=["filename", "page"],
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are we changing key fields here?

)


@cocoindex.transform_flow()
def query_to_colpali_embedding(
text: cocoindex.DataSlice[str],
) -> cocoindex.DataSlice[list[list[float]]]:
return text.transform(
cocoindex.functions.ColPaliEmbedQuery(model=COLPALI_MODEL_NAME)
)
query_to_colpali_embedding = cocoindex.functions.ColPali(model_name=COLPALI_MODEL_NAME)


def _main() -> None:
Expand Down Expand Up @@ -122,7 +115,7 @@ def _main() -> None:
payload = result.payload
if payload is None:
continue
page_number = payload["page"]
page_number: Optional[int] = payload.get("page")
page_number_str = f"Page:{page_number}" if page_number is not None else ""
print(f"[{score:.3f}] {payload['filename']} {page_number_str}")
print("---")
Expand Down
Loading