Skip to content

Commit d41a7b4

Browse files
authored
Update main.py
1 parent c7a0a38 commit d41a7b4

File tree

1 file changed

+16
-23
lines changed
  • examples/multi_format_indexing

1 file changed

+16
-23
lines changed

examples/multi_format_indexing/main.py

Lines changed: 16 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import cocoindex
22
import os
33
import mimetypes
4+
from typing import List, Optional, Any, Dict
45

56
from dotenv import load_dotenv
67
from dataclasses import dataclass
@@ -16,12 +17,12 @@
1617

1718
@dataclass
1819
class Page:
19-
page_number: int | None
20+
page_number: Optional[int]
2021
image: bytes
2122

2223

2324
@cocoindex.op.function()
24-
def file_to_pages(filename: str, content: bytes) -> list[Page]:
25+
def file_to_pages(filename: str, content: bytes) -> List[Page]:
2526
"""
2627
Classify file content based on MIME type detection.
2728
Returns a list of Page objects, each holding an optional page number and the page image bytes.
@@ -31,7 +32,7 @@ def file_to_pages(filename: str, content: bytes) -> list[Page]:
3132

3233
if mime_type == "application/pdf":
3334
images = convert_from_bytes(content, dpi=300)
34-
pages = []
35+
pages: List[Page] = []
3536
for i, image in enumerate(images):
3637
with BytesIO() as buffer:
3738
image.save(buffer, format="PNG")
@@ -54,46 +55,38 @@ def multi_format_indexing_flow(
5455
flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
5556
) -> None:
5657
"""
57-
Define an example flow that embeds files into a vector database.
58+
Define an example flow that embeds the pages of multi-format files into a vector database.
5859
"""
5960
data_scope["documents"] = flow_builder.add_source(
60-
cocoindex.sources.LocalFile(path="source_files", binary=True)
61+
cocoindex.sources.LocalFile(path="data", binary=True)
6162
)
6263

63-
output_embeddings = data_scope.add_collector()
64+
embeddings_index = data_scope.add_collector()
6465

6566
with data_scope["documents"].row() as doc:
66-
doc["pages"] = flow_builder.transform(
67-
file_to_pages, filename=doc["filename"], content=doc["content"]
68-
)
67+
doc["pages"] = doc.transform(file_to_pages, filename=doc["filename"], content=doc["content"])
6968
with doc["pages"].row() as page:
7069
page["embedding"] = page["image"].transform(
71-
cocoindex.functions.ColPaliEmbedImage(model=COLPALI_MODEL_NAME)
70+
cocoindex.functions.ColPali(model_name=COLPALI_MODEL_NAME)
7271
)
73-
output_embeddings.collect(
74-
id=cocoindex.GeneratedField.UUID,
72+
embeddings_index.collect(
7573
filename=doc["filename"],
7674
page=page["page_number"],
7775
embedding=page["embedding"],
7876
)
7977

80-
output_embeddings.export(
81-
"multi_format_indexings",
78+
embeddings_index.export(
79+
"output",
8280
cocoindex.targets.Qdrant(
8381
connection=qdrant_connection,
8482
collection_name=QDRANT_COLLECTION,
83+
vector_field_name="embedding",
8584
),
86-
primary_key_fields=["id"],
85+
primary_key_fields=["filename", "page"],
8786
)
8887

8988

90-
@cocoindex.transform_flow()
91-
def query_to_colpali_embedding(
92-
text: cocoindex.DataSlice[str],
93-
) -> cocoindex.DataSlice[list[list[float]]]:
94-
return text.transform(
95-
cocoindex.functions.ColPaliEmbedQuery(model=COLPALI_MODEL_NAME)
96-
)
89+
query_to_colpali_embedding = cocoindex.functions.ColPali(model_name=COLPALI_MODEL_NAME)
9790

9891

9992
def _main() -> None:
@@ -122,7 +115,7 @@ def _main() -> None:
122115
payload = result.payload
123116
if payload is None:
124117
continue
125-
page_number = payload["page"]
118+
page_number: Optional[int] = payload.get("page")
126119
page_number_str = f"Page:{page_number}" if page_number is not None else ""
127120
print(f"[{score:.3f}] {payload['filename']} {page_number_str}")
128121
print("---")

0 commit comments

Comments
 (0)