11import cocoindex
22import os
33import mimetypes
4+ from typing import List , Optional , Any , Dict
45
56from dotenv import load_dotenv
67from dataclasses import dataclass
1617
1718@dataclass
1819class Page :
19- page_number : int | None
20+ page_number : Optional [ int ]
2021 image : bytes
2122
2223
2324@cocoindex .op .function ()
24- def file_to_pages (filename : str , content : bytes ) -> list [Page ]:
25+ def file_to_pages (filename : str , content : bytes ) -> List [Page ]:
2526 """
2627 Classify file content based on MIME type detection.
2728 Returns a list of Page objects built from the file content.
@@ -31,7 +32,7 @@ def file_to_pages(filename: str, content: bytes) -> list[Page]:
3132
3233 if mime_type == "application/pdf" :
3334 images = convert_from_bytes (content , dpi = 300 )
34- pages = []
35+ pages : List [ Page ] = []
3536 for i , image in enumerate (images ):
3637 with BytesIO () as buffer :
3738 image .save (buffer , format = "PNG" )
@@ -54,46 +55,38 @@ def multi_format_indexing_flow(
5455 flow_builder : cocoindex .FlowBuilder , data_scope : cocoindex .DataScope
5556) -> None :
5657 """
57- Define an example flow that embeds files into a vector database .
58+ Define an example flow that embeds document page images and exports the embeddings to Qdrant.
5859 """
5960 data_scope ["documents" ] = flow_builder .add_source (
60- cocoindex .sources .LocalFile (path = "source_files " , binary = True )
61+ cocoindex .sources .LocalFile (path = "data " , binary = True )
6162 )
6263
63- output_embeddings = data_scope .add_collector ()
64+ embeddings_index = data_scope .add_collector ()
6465
6566 with data_scope ["documents" ].row () as doc :
66- doc ["pages" ] = flow_builder .transform (
67- file_to_pages , filename = doc ["filename" ], content = doc ["content" ]
68- )
67+ doc ["pages" ] = doc .transform (file_to_pages , filename = doc ["filename" ], content = doc ["content" ])
6968 with doc ["pages" ].row () as page :
7069 page ["embedding" ] = page ["image" ].transform (
71- cocoindex .functions .ColPaliEmbedImage ( model = COLPALI_MODEL_NAME )
70+ cocoindex .functions .ColPali ( model_name = COLPALI_MODEL_NAME )
7271 )
73- output_embeddings .collect (
74- id = cocoindex .GeneratedField .UUID ,
72+ embeddings_index .collect (
7573 filename = doc ["filename" ],
7674 page = page ["page_number" ],
7775 embedding = page ["embedding" ],
7876 )
7977
80- output_embeddings .export (
81- "multi_format_indexings " ,
78+ embeddings_index .export (
79+ "output " ,
8280 cocoindex .targets .Qdrant (
8381 connection = qdrant_connection ,
8482 collection_name = QDRANT_COLLECTION ,
83+ vector_field_name = "embedding" ,
8584 ),
86- primary_key_fields = ["id " ],
85+ primary_key_fields = ["filename" , "page " ],
8786 )
8887
8988
90- @cocoindex .transform_flow ()
91- def query_to_colpali_embedding (
92- text : cocoindex .DataSlice [str ],
93- ) -> cocoindex .DataSlice [list [list [float ]]]:
94- return text .transform (
95- cocoindex .functions .ColPaliEmbedQuery (model = COLPALI_MODEL_NAME )
96- )
89+ query_to_colpali_embedding = cocoindex .functions .ColPali (model_name = COLPALI_MODEL_NAME )
9790
9891
9992def _main () -> None :
@@ -122,7 +115,7 @@ def _main() -> None:
122115 payload = result .payload
123116 if payload is None :
124117 continue
125- page_number = payload [ "page" ]
118+ page_number : Optional [ int ] = payload . get ( "page" )
126119 page_number_str = f"Page:{ page_number } " if page_number is not None else ""
127120 print (f"[{ score :.3f} ] { payload ['filename' ]} { page_number_str } " )
128121 print ("---" )
0 commit comments