From c7a0a385e604990e72f3500434d75708cc83ad6a Mon Sep 17 00:00:00 2001 From: Dev Kumar Pal <72178142+devkumar2313@users.noreply.github.com> Date: Thu, 2 Oct 2025 08:37:28 +0530 Subject: [PATCH 1/2] Update main.py --- examples/custom_output_files/main.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/examples/custom_output_files/main.py b/examples/custom_output_files/main.py index 5bbfa83db..73cff3455 100644 --- a/examples/custom_output_files/main.py +++ b/examples/custom_output_files/main.py @@ -1,6 +1,7 @@ from datetime import timedelta import os import dataclasses +from typing import Optional, Dict, Tuple, Any import cocoindex from markdown_it import MarkdownIt @@ -8,6 +9,7 @@ _markdown_it = MarkdownIt("gfm-like") +@dataclasses.dataclass class LocalFileTarget(cocoindex.op.TargetSpec): """Represents the custom target spec.""" @@ -36,7 +38,9 @@ def describe(key: str) -> str: @staticmethod def apply_setup_change( - key: str, previous: LocalFileTarget | None, current: LocalFileTarget | None + key: str, + previous: Optional[LocalFileTarget], + current: Optional[LocalFileTarget] ) -> None: """ Apply setup changes to the target. @@ -68,7 +72,7 @@ def prepare(spec: LocalFileTarget) -> LocalFileTarget: @staticmethod def mutate( - *all_mutations: tuple[LocalFileTarget, dict[str, LocalFileTargetValues | None]], + *all_mutations: Tuple[LocalFileTarget, Dict[str, Optional[LocalFileTargetValues]]], ) -> None: """ Mutate the target. @@ -90,7 +94,7 @@ def mutate( except FileNotFoundError: pass else: - with open(full_path, "w") as f: + with open(full_path, "w", encoding="utf-8") as f: f.write(mutation.html) From d41a7b4927e6f36be7007c8b46375105ad29297d Mon Sep 17 00:00:00 2001 From: Dev Kumar Pal <72178142+devkumar2313@users.noreply.github.com> Date: Thu, 2 Oct 2025 08:38:13 +0530 Subject: [PATCH 2/2] Update main.py --- examples/multi_format_indexing/main.py | 39 +++++++++++--------------- 1 file changed, 16 insertions(+), 23 deletions(-) diff --git a/examples/multi_format_indexing/main.py b/examples/multi_format_indexing/main.py index aab794e1a..752bdc3cc 100644 --- a/examples/multi_format_indexing/main.py +++ b/examples/multi_format_indexing/main.py @@ -1,6 +1,7 @@ import cocoindex import os import mimetypes +from typing import List, Optional, Any, Dict from dotenv import load_dotenv from dataclasses import dataclass @@ -16,12 +17,12 @@ @dataclass class Page: - page_number: int | None + page_number: Optional[int] image: bytes @cocoindex.op.function() -def file_to_pages(filename: str, content: bytes) -> list[Page]: +def file_to_pages(filename: str, content: bytes) -> List[Page]: """ Classify file content based on MIME type detection. Returns ClassifiedFileContent with appropriate field populated based on file type. @@ -31,7 +32,7 @@ def file_to_pages(filename: str, content: bytes) -> list[Page]: if mime_type == "application/pdf": images = convert_from_bytes(content, dpi=300) - pages = [] + pages: List[Page] = [] for i, image in enumerate(images): with BytesIO() as buffer: image.save(buffer, format="PNG") @@ -54,46 +55,38 @@ def multi_format_indexing_flow( flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope ) -> None: """ - Define an example flow that embeds files into a vector database. + Define an example flow that extracts manual information from a Markdown. """ data_scope["documents"] = flow_builder.add_source( - cocoindex.sources.LocalFile(path="source_files", binary=True) + cocoindex.sources.LocalFile(path="data", binary=True) ) - output_embeddings = data_scope.add_collector() + embeddings_index = data_scope.add_collector() with data_scope["documents"].row() as doc: - doc["pages"] = flow_builder.transform( - file_to_pages, filename=doc["filename"], content=doc["content"] - ) + doc["pages"] = doc.transform(file_to_pages, filename=doc["filename"], content=doc["content"]) with doc["pages"].row() as page: page["embedding"] = page["image"].transform( - cocoindex.functions.ColPaliEmbedImage(model=COLPALI_MODEL_NAME) + cocoindex.functions.ColPali(model_name=COLPALI_MODEL_NAME) ) - output_embeddings.collect( - id=cocoindex.GeneratedField.UUID, + embeddings_index.collect( filename=doc["filename"], page=page["page_number"], embedding=page["embedding"], ) - output_embeddings.export( - "multi_format_indexings", + embeddings_index.export( + "output", cocoindex.targets.Qdrant( connection=qdrant_connection, collection_name=QDRANT_COLLECTION, + vector_field_name="embedding", ), - primary_key_fields=["id"], + primary_key_fields=["filename", "page"], ) -@cocoindex.transform_flow() -def query_to_colpali_embedding( - text: cocoindex.DataSlice[str], -) -> cocoindex.DataSlice[list[list[float]]]: - return text.transform( - cocoindex.functions.ColPaliEmbedQuery(model=COLPALI_MODEL_NAME) - ) +query_to_colpali_embedding = cocoindex.functions.ColPali(model_name=COLPALI_MODEL_NAME) def _main() -> None: @@ -122,7 +115,7 @@ def _main() -> None: payload = result.payload if payload is None: continue - page_number = payload["page"] + page_number: Optional[int] = payload.get("page") page_number_str = f"Page:{page_number}" if page_number is not None else "" print(f"[{score:.3f}] {payload['filename']} {page_number_str}") print("---")