-
Notifications
You must be signed in to change notification settings - Fork 290
Fix mypy type errors in examples directory #1098
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,13 +1,15 @@ | ||
| from datetime import timedelta | ||
| import os | ||
| import dataclasses | ||
| from typing import Optional, Dict, Tuple, Any | ||
|
|
||
| import cocoindex | ||
| from markdown_it import MarkdownIt | ||
|
|
||
| _markdown_it = MarkdownIt("gfm-like") | ||
|
|
||
|
|
||
| @dataclasses.dataclass | ||
| class LocalFileTarget(cocoindex.op.TargetSpec): | ||
| """Represents the custom target spec.""" | ||
|
|
||
|
|
@@ -36,7 +38,9 @@ def describe(key: str) -> str: | |
|
|
||
| @staticmethod | ||
| def apply_setup_change( | ||
| key: str, previous: LocalFileTarget | None, current: LocalFileTarget | None | ||
| key: str, | ||
| previous: Optional[LocalFileTarget], | ||
| current: Optional[LocalFileTarget] | ||
| ) -> None: | ||
| """ | ||
| Apply setup changes to the target. | ||
|
|
@@ -68,7 +72,7 @@ def prepare(spec: LocalFileTarget) -> LocalFileTarget: | |
|
|
||
| @staticmethod | ||
| def mutate( | ||
| *all_mutations: tuple[LocalFileTarget, dict[str, LocalFileTargetValues | None]], | ||
| *all_mutations: Tuple[LocalFileTarget, Dict[str, Optional[LocalFileTargetValues]]], | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Using |
||
| ) -> None: | ||
| """ | ||
| Mutate the target. | ||
|
|
@@ -90,7 +94,7 @@ def mutate( | |
| except FileNotFoundError: | ||
| pass | ||
| else: | ||
| with open(full_path, "w") as f: | ||
| with open(full_path, "w", encoding="utf-8") as f: | ||
| f.write(mutation.html) | ||
|
|
||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,6 +1,7 @@ | ||
| import cocoindex | ||
| import os | ||
| import mimetypes | ||
| from typing import List, Optional, Any, Dict | ||
|
|
||
| from dotenv import load_dotenv | ||
| from dataclasses import dataclass | ||
|
|
@@ -16,12 +17,12 @@ | |
|
|
||
| @dataclass | ||
| class Page: | ||
| page_number: int | None | ||
| page_number: Optional[int] | ||
| image: bytes | ||
|
|
||
|
|
||
| @cocoindex.op.function() | ||
| def file_to_pages(filename: str, content: bytes) -> list[Page]: | ||
| def file_to_pages(filename: str, content: bytes) -> List[Page]: | ||
| """ | ||
| Classify file content based on MIME type detection. | ||
| Returns ClassifiedFileContent with appropriate field populated based on file type. | ||
|
|
@@ -31,7 +32,7 @@ def file_to_pages(filename: str, content: bytes) -> list[Page]: | |
|
|
||
| if mime_type == "application/pdf": | ||
| images = convert_from_bytes(content, dpi=300) | ||
| pages = [] | ||
| pages: List[Page] = [] | ||
| for i, image in enumerate(images): | ||
| with BytesIO() as buffer: | ||
| image.save(buffer, format="PNG") | ||
|
|
@@ -54,46 +55,38 @@ def multi_format_indexing_flow( | |
| flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope | ||
| ) -> None: | ||
| """ | ||
| Define an example flow that embeds files into a vector database. | ||
| Define an example flow that extracts manual information from a Markdown. | ||
| """ | ||
| data_scope["documents"] = flow_builder.add_source( | ||
| cocoindex.sources.LocalFile(path="source_files", binary=True) | ||
| cocoindex.sources.LocalFile(path="data", binary=True) | ||
| ) | ||
|
|
||
| output_embeddings = data_scope.add_collector() | ||
| embeddings_index = data_scope.add_collector() | ||
|
|
||
| with data_scope["documents"].row() as doc: | ||
| doc["pages"] = flow_builder.transform( | ||
| file_to_pages, filename=doc["filename"], content=doc["content"] | ||
| ) | ||
| doc["pages"] = doc.transform(file_to_pages, filename=doc["filename"], content=doc["content"]) | ||
| with doc["pages"].row() as page: | ||
| page["embedding"] = page["image"].transform( | ||
| cocoindex.functions.ColPaliEmbedImage(model=COLPALI_MODEL_NAME) | ||
| cocoindex.functions.ColPali(model_name=COLPALI_MODEL_NAME) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't think we have Is it tested? |
||
| ) | ||
| output_embeddings.collect( | ||
| id=cocoindex.GeneratedField.UUID, | ||
| embeddings_index.collect( | ||
| filename=doc["filename"], | ||
| page=page["page_number"], | ||
| embedding=page["embedding"], | ||
| ) | ||
|
|
||
| output_embeddings.export( | ||
| "multi_format_indexings", | ||
| embeddings_index.export( | ||
| "output", | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We should avoid updating target name. It'll force existing users rebuild the index. |
||
| cocoindex.targets.Qdrant( | ||
| connection=qdrant_connection, | ||
| collection_name=QDRANT_COLLECTION, | ||
| vector_field_name="embedding", | ||
| ), | ||
| primary_key_fields=["id"], | ||
| primary_key_fields=["filename", "page"], | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why are we changing key fields here? |
||
| ) | ||
|
|
||
|
|
||
| @cocoindex.transform_flow() | ||
| def query_to_colpali_embedding( | ||
| text: cocoindex.DataSlice[str], | ||
| ) -> cocoindex.DataSlice[list[list[float]]]: | ||
| return text.transform( | ||
| cocoindex.functions.ColPaliEmbedQuery(model=COLPALI_MODEL_NAME) | ||
| ) | ||
| query_to_colpali_embedding = cocoindex.functions.ColPali(model_name=COLPALI_MODEL_NAME) | ||
|
|
||
|
|
||
| def _main() -> None: | ||
|
|
@@ -122,7 +115,7 @@ def _main() -> None: | |
| payload = result.payload | ||
| if payload is None: | ||
| continue | ||
| page_number = payload["page"] | ||
| page_number: Optional[int] = payload.get("page") | ||
| page_number_str = f"Page:{page_number}" if page_number is not None else "" | ||
| print(f"[{score:.3f}] {payload['filename']} {page_number_str}") | ||
| print("---") | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This shouldn't be needed