diff --git a/docs/docs/ops/functions.md b/docs/docs/ops/functions.md
index bb28c77f0..25c4b50c4 100644
--- a/docs/docs/ops/functions.md
+++ b/docs/docs/ops/functions.md
@@ -189,24 +189,33 @@ Input data:
 
 Return: *Vector[Float32, N]*, where *N* is the dimension of the embedding vector determined by the model.
 
-## ColPaliEmbedImage
+## ColPali Functions
 
-`ColPaliEmbedImage` embeds images using the ColPali multimodal model.
+ColPali functions enable multimodal document retrieval using ColVision models. These functions support all models available in the [colpali-engine library](https://github.com/illuin-tech/colpali), including:
 
-ColPali (Contextual Late-interaction over Patches) uses late interaction between image patch embeddings and text token embeddings for retrieval.
+- **ColPali models** (colpali-*): PaliGemma-based, best for general document retrieval
+- **ColQwen2 models** (colqwen-*): Qwen2-VL-based, excellent for multilingual text (29+ languages) and general vision
+- **ColSmol models** (colsmol-*): Lightweight, good for resource-constrained environments
+- Any future ColVision models supported by colpali-engine
+
+These models use late interaction between image patch embeddings and text token embeddings for retrieval.
 
 :::note Optional Dependency Required
 
-This function requires the `colpali-engine` library, which is an optional dependency. Install CocoIndex with:
+These functions require the `colpali-engine` library, which is an optional dependency. Install CocoIndex with:
 
 ```bash
 pip install 'cocoindex[colpali]'
 ```
 :::
 
+### ColPaliEmbedImage
+
+`ColPaliEmbedImage` embeds images using ColVision multimodal models.
+
 The spec takes the following fields:
 
-* `model` (`str`): The ColPali model name to use (e.g., "vidore/colpali-v1.2")
+* `model` (`str`): Any ColVision model name supported by colpali-engine (e.g., "vidore/colpali-v1.2", "vidore/colqwen2.5-v0.2", "vidore/colSmol-256M"). See the [complete list of supported models](https://github.com/illuin-tech/colpali#list-of-colvision-models).
 
 Input data:
 
@@ -214,24 +223,15 @@ Input data:
 
 Return: *Vector[Vector[Float32, N]]*, where *N* is the hidden dimension determined by the model. This returns a multi-vector format with variable patches and fixed hidden dimension.
 
-## ColPaliEmbedQuery
+### ColPaliEmbedQuery
 
-`ColPaliEmbedQuery` embeds text queries using the ColPali multimodal model.
-
-This produces query embeddings compatible with ColPali image embeddings for late interaction scoring (MaxSim).
-
-:::note Optional Dependency Required
+`ColPaliEmbedQuery` embeds text queries using ColVision multimodal models.
 
-This function requires the `colpali-engine` library, which is an optional dependency. Install CocoIndex with:
-
-```bash
-pip install 'cocoindex[colpali]'
-```
-:::
+This produces query embeddings compatible with ColVision image embeddings for late interaction scoring (MaxSim).
 
 The spec takes the following fields:
 
-* `model` (`str`): The ColPali model name to use (e.g., "vidore/colpali-v1.2")
+* `model` (`str`): Any ColVision model name supported by colpali-engine (e.g., "vidore/colpali-v1.2", "vidore/colqwen2.5-v0.2", "vidore/colSmol-256M"). See the [complete list of supported models](https://github.com/illuin-tech/colpali#list-of-colvision-models).
 
 Input data:
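A quick aside on the "late interaction scoring (MaxSim)" the docs above reference: per query token, take the similarity to its best-matching image patch, then sum over tokens. A minimal NumPy sketch (illustrative only, not part of this diff; shapes follow the multi-vector return types documented above):

```python
import numpy as np

def maxsim_score(query_emb: np.ndarray, image_emb: np.ndarray) -> float:
    """MaxSim late-interaction score between one query and one image.

    query_emb: (num_query_tokens, N) token embeddings, L2-normalized per row.
    image_emb: (num_patches, N) patch embeddings, L2-normalized per row.
    """
    sim = query_emb @ image_emb.T        # cosine similarities, (tokens, patches)
    return float(sim.max(axis=1).sum())  # best patch per token, summed over tokens
```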
diff --git a/examples/image_search/README.md b/examples/image_search/README.md
index 2de157ca2..e49a851bd 100644
--- a/examples/image_search/README.md
+++ b/examples/image_search/README.md
@@ -68,7 +68,21 @@ export OLLAMA_MODEL="gemma3" # Optional, for caption generation
 - Configure model (optional):
   ```sh
+  # All ColVision models supported by colpali-engine are available.
+  # See https://github.com/illuin-tech/colpali#list-of-colvision-models for the complete list.
+
+  # ColPali models (colpali-*) - PaliGemma-based, best for general document retrieval
   export COLPALI_MODEL="vidore/colpali-v1.2" # Default model
+  export COLPALI_MODEL="vidore/colpali-v1.3" # Latest version
+
+  # ColQwen2 models (colqwen-*) - Qwen2-VL-based, excellent for multilingual text (29+ languages) and general vision
+  export COLPALI_MODEL="vidore/colqwen2-v1.0"
+  export COLPALI_MODEL="vidore/colqwen2.5-v0.2" # Latest Qwen2.5 model
+
+  # ColSmol models (colsmol-*) - Lightweight, good for resource-constrained environments
+  export COLPALI_MODEL="vidore/colSmol-256M"
+
+  # Any other ColVision model from https://github.com/illuin-tech/colpali is supported
   ```
 - Run ColPali Backend:
diff --git a/examples/patient_intake_extraction/README.md b/examples/patient_intake_extraction/README.md
index 60925484b..e4aa5a490 100644
--- a/examples/patient_intake_extraction/README.md
+++ b/examples/patient_intake_extraction/README.md
@@ -1,9 +1,9 @@
 # Extract structured data from patient intake forms with LLM
 
-[![GitHub](https://img.shields.io/github/stars/cocoindex-io/cocoindex?color=5B5BD6)](https://github.com/cocoindex-io/cocoindex)
+[![GitHub](https://img.shields.io/github/stars/cocoindex-io/cocoindex?color=5B5BD6)](https://github.com/cocoindex-io/cocoindex) We appreciate a star ⭐ at [CocoIndex Github](https://github.com/cocoindex-io/cocoindex) if this is helpful.
 
-This repo shows how to use LLM to extract structured data from patient intake forms with different formats - like PDF, Docx, etc.
+This repo shows how to use LLMs to extract structured data from patient intake forms in different formats, like PDF, Docx, etc. CocoIndex supports multiple [sources](https://cocoindex.io/docs/ops/sources) and [LLM models](https://cocoindex.io/docs/ai/llm) natively.
 
 ![Structured Data From Patient Intake Forms](https://github.com/user-attachments/assets/1f6afb69-d26d-4a08-8774-13982d6aec1e)
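For orientation, a hedged sketch of how the `COLPALI_MODEL` variable above could be wired into the specs this PR extends (standalone snippet; the surrounding flow setup is omitted, and only the documented `model` field is assumed):

```python
import os

import cocoindex

# Any ColVision model name works here; fall back to the documented default.
model_name = os.environ.get("COLPALI_MODEL", "vidore/colpali-v1.2")

# Specs defined in python/cocoindex/functions.py (see below in this diff);
# both accept the model name via their `model` field.
embed_image = cocoindex.functions.ColPaliEmbedImage(model=model_name)
embed_query = cocoindex.functions.ColPaliEmbedQuery(model=model_name)
```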
diff --git a/python/cocoindex/functions.py b/python/cocoindex/functions.py
index 97e520cdb..675e0ff73 100644
--- a/python/cocoindex/functions.py
+++ b/python/cocoindex/functions.py
@@ -116,19 +116,62 @@ def __call__(self, text: str) -> NDArray[np.float32]:
 def _get_colpali_model_and_processor(model_name: str) -> ColPaliModelInfo:
     """Get or load ColPali model and processor, with caching."""
     try:
-        from colpali_engine.models import ColPali, ColPaliProcessor  # type: ignore[import-untyped]
+        from colpali_engine.models import (  # type: ignore[import-untyped]
+            ColPali,
+            ColPaliProcessor,
+            ColQwen2,
+            ColQwen2Processor,
+            ColQwen2_5,
+            ColQwen2_5_Processor,
+            ColIdefics3,
+            ColIdefics3Processor,
+        )
         from colpali_engine.utils.torch_utils import get_torch_device  # type: ignore[import-untyped]
         import torch
     except ImportError as e:
         raise ImportError(
-            "ColPali is not available. Make sure cocoindex is installed with ColPali support."
+            "ColVision models are not available. Make sure cocoindex is installed with ColPali support."
         ) from e
 
     device = get_torch_device("auto")
-    model = ColPali.from_pretrained(
-        model_name, device_map=device, torch_dtype=torch.bfloat16
-    ).eval()
-    processor = ColPaliProcessor.from_pretrained(model_name)
+
+    # Manual model detection based on model name
+    model_name_lower = model_name.lower()
+
+    try:
+        if "qwen2.5" in model_name_lower:
+            model = ColQwen2_5.from_pretrained(
+                model_name,
+                torch_dtype=torch.bfloat16,
+                device_map=device,
+            ).eval()
+            processor = ColQwen2_5_Processor.from_pretrained(model_name)
+        elif "qwen2" in model_name_lower:
+            model = ColQwen2.from_pretrained(
+                model_name,
+                torch_dtype=torch.bfloat16,
+                device_map=device,
+            ).eval()
+            processor = ColQwen2Processor.from_pretrained(model_name)
+        elif "colsmol" in model_name_lower or "smol" in model_name_lower:
+            # ColSmol models use the Idefics3 architecture
+            model = ColIdefics3.from_pretrained(
+                model_name,
+                torch_dtype=torch.bfloat16,
+                device_map=device,
+            ).eval()
+            processor = ColIdefics3Processor.from_pretrained(model_name)
+        else:
+            # Default to ColPali
+            model = ColPali.from_pretrained(
+                model_name,
+                torch_dtype=torch.bfloat16,
+                device_map=device,
+            ).eval()
+            processor = ColPaliProcessor.from_pretrained(model_name)
+    except Exception as e:
+        raise RuntimeError(f"Failed to load model {model_name}: {e}") from e
 
     # Get dimension from the actual model
     dimension = _detect_colpali_dimension(model, processor, device)
@@ -167,17 +210,25 @@ def _detect_colpali_dimension(model: Any, processor: Any, device: Any) -> int:
 class ColPaliEmbedImage(op.FunctionSpec):
     """
-    `ColPaliEmbedImage` embeds images using the ColPali multimodal model.
+    `ColPaliEmbedImage` embeds images using ColVision multimodal models.
+
+    Supports all models available in the colpali-engine library, including:
+    - ColPali models (colpali-*): PaliGemma-based, best for general document retrieval
+    - ColQwen2 models (colqwen-*): Qwen2-VL-based, excellent for multilingual text (29+ languages) and general vision
+    - ColSmol models (colsmol-*): Lightweight, good for resource-constrained environments
+    - Any future ColVision models supported by colpali-engine
 
-    ColPali (Contextual Late-interaction over Patches) uses late interaction
-    between image patch embeddings and text token embeddings for retrieval.
+    These models use late interaction between image patch embeddings and text token
+    embeddings for retrieval.
 
     Args:
-        model: The ColPali model name to use (e.g., "vidore/colpali-v1.2")
+        model: Any ColVision model name supported by colpali-engine
+            (e.g., "vidore/colpali-v1.2", "vidore/colqwen2.5-v0.2", "vidore/colSmol-256M").
+            See https://github.com/illuin-tech/colpali for the complete list of supported models.
 
     Note:
         This function requires the optional colpali-engine dependency.
-        Install it with: pip install 'cocoindex[embeddings]'
+        Install it with: pip install 'cocoindex[colpali]'
     """
 
     model: str
@@ -189,7 +240,7 @@ class ColPaliEmbedImage(op.FunctionSpec):
     behavior_version=1,
 )
 class ColPaliEmbedImageExecutor:
-    """Executor for ColPaliEmbedImage."""
+    """Executor for ColVision image embedding (ColPali, ColQwen2, ColSmol, etc.)."""
 
     spec: ColPaliEmbedImage
     _model_info: ColPaliModelInfo
@@ -209,7 +260,7 @@ def __call__(self, img_bytes: bytes) -> Any:
             import io
         except ImportError as e:
             raise ImportError(
-                "Required dependencies (PIL, torch) are missing for ColPali image embedding."
+                "Required dependencies (PIL, torch) are missing for ColVision image embedding."
             ) from e
 
         model = self._model_info.model
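The substring dispatch in `_get_colpali_model_and_processor` is order-sensitive: "qwen2.5" must be checked before "qwen2", since the latter matches both. A standalone mirror of the rules, for illustration only (not part of the diff):

```python
# Mirrors the name-based dispatch above; order matters ("qwen2.5" before "qwen2").
for name in ["vidore/colpali-v1.3", "vidore/colqwen2-v1.0",
             "vidore/colqwen2.5-v0.2", "vidore/colSmol-256M"]:
    lower = name.lower()
    if "qwen2.5" in lower:
        family = "ColQwen2_5"
    elif "qwen2" in lower:
        family = "ColQwen2"
    elif "colsmol" in lower or "smol" in lower:
        family = "ColIdefics3"  # ColSmol models use the Idefics3 architecture
    else:
        family = "ColPali"  # default family
    print(f"{name} -> {family}")
```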
@@ -235,17 +286,25 @@ def __call__(self, img_bytes: bytes) -> Any:
 class ColPaliEmbedQuery(op.FunctionSpec):
     """
-    `ColPaliEmbedQuery` embeds text queries using the ColPali multimodal model.
+    `ColPaliEmbedQuery` embeds text queries using ColVision multimodal models.
+
+    Supports all models available in the colpali-engine library, including:
+    - ColPali models (colpali-*): PaliGemma-based, best for general document retrieval
+    - ColQwen2 models (colqwen-*): Qwen2-VL-based, excellent for multilingual text (29+ languages) and general vision
+    - ColSmol models (colsmol-*): Lightweight, good for resource-constrained environments
+    - Any future ColVision models supported by colpali-engine
 
-    This produces query embeddings compatible with ColPali image embeddings
+    This produces query embeddings compatible with ColVision image embeddings
     for late interaction scoring (MaxSim).
 
     Args:
-        model: The ColPali model name to use (e.g., "vidore/colpali-v1.2")
+        model: Any ColVision model name supported by colpali-engine
+            (e.g., "vidore/colpali-v1.2", "vidore/colqwen2.5-v0.2", "vidore/colSmol-256M").
+            See https://github.com/illuin-tech/colpali for the complete list of supported models.
 
     Note:
         This function requires the optional colpali-engine dependency.
-        Install it with: pip install 'cocoindex[embeddings]'
+        Install it with: pip install 'cocoindex[colpali]'
     """
 
     model: str
@@ -257,7 +316,7 @@ class ColPaliEmbedQuery(op.FunctionSpec):
     behavior_version=1,
 )
 class ColPaliEmbedQueryExecutor:
-    """Executor for ColPaliEmbedQuery."""
+    """Executor for ColVision query embedding (ColPali, ColQwen2, ColSmol, etc.)."""
 
     spec: ColPaliEmbedQuery
     _model_info: ColPaliModelInfo
@@ -275,7 +334,7 @@ def __call__(self, query: str) -> Any:
             import torch
         except ImportError as e:
             raise ImportError(
-                "Required dependencies (torch) are missing for ColPali query embedding."
+                "Required dependencies (torch) are missing for ColVision query embedding."
             ) from e
 
         model = self._model_info.model
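End to end, retrieval ranks stored image multi-vectors against a query multi-vector by MaxSim. A self-contained sketch (illustrative; a real deployment would delegate this to a vector store with multi-vector scoring support):

```python
import numpy as np

def rank_by_maxsim(query_emb: np.ndarray, image_embs: list[np.ndarray]) -> list[int]:
    """Return image indices ordered best-first by MaxSim score (illustrative)."""
    scores = []
    for patches in image_embs:
        sim = query_emb @ patches.T           # (query_tokens, num_patches)
        scores.append(sim.max(axis=1).sum())  # best patch per token, summed
    return sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
```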