
Commit 14eddad

📝 Add docstrings to fix-no-internet-embed-parallel
Docstrings generation was requested by @joein.

* #524 (comment)

The following files were modified:

* `fastembed/image/onnx_embedding.py`
* `fastembed/image/onnx_image_model.py`
* `fastembed/late_interaction/colbert.py`
* `fastembed/late_interaction_multimodal/colpali.py`
* `fastembed/late_interaction_multimodal/onnx_multimodal_model.py`
* `fastembed/rerank/cross_encoder/onnx_text_cross_encoder.py`
* `fastembed/rerank/cross_encoder/onnx_text_model.py`
* `fastembed/sparse/bm25.py`
* `fastembed/sparse/bm42.py`
* `fastembed/sparse/minicoil.py`
* `fastembed/sparse/splade_pp.py`
* `fastembed/text/onnx_embedding.py`
* `fastembed/text/onnx_text_model.py`
1 parent 95d166f commit 14eddad

13 files changed: +263 -247 lines changed


fastembed/image/onnx_embedding.py

Lines changed: 20 additions & 28 deletions
@@ -74,25 +74,21 @@ def __init__(
         **kwargs: Any,
     ):
         """
+        Initializes an ONNX image embedding model with configurable device, threading, and loading options.
+
         Args:
-            model_name (str): The name of the model to use.
-            cache_dir (str, optional): The path to the cache directory.
-                Can be set using the `FASTEMBED_CACHE_PATH` env variable.
-                Defaults to `fastembed_cache` in the system's temp directory.
-            threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None.
-            providers (Optional[Sequence[OnnxProvider]], optional): The list of onnxruntime providers to use.
-                Mutually exclusive with the `cuda` and `device_ids` arguments. Defaults to None.
-            cuda (bool, optional): Whether to use cuda for inference. Mutually exclusive with `providers`
-                Defaults to False.
-            device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in
-                workers. Should be used with `cuda=True`, mutually exclusive with `providers`. Defaults to None.
-            lazy_load (bool, optional): Whether to load the model during class initialization or on demand.
-                Should be set to True when using multiple-gpu and parallel encoding. Defaults to False.
-            device_id (Optional[int], optional): The device id to use for loading the model in the worker process.
-            specific_model_path (Optional[str], optional): The specific path to the onnx model dir if it should be imported from somewhere else
-
+            model_name: Name of the ONNX model to use, in the format <org>/<model>.
+            cache_dir: Optional directory for caching model files.
+            threads: Number of threads for ONNX runtime session.
+            providers: Optional list of ONNX runtime providers to use for inference.
+            cuda: If True, enables CUDA for inference; mutually exclusive with `providers`.
+            device_ids: Optional list of device IDs for parallel processing; used with `cuda=True`.
+            lazy_load: If True, defers model loading until first use.
+            device_id: Optional device ID for model loading in the current process.
+            specific_model_path: Optional path to a specific ONNX model directory.
+
         Raises:
-            ValueError: If the model_name is not in the format <org>/<model> e.g. BAAI/bge-base-en.
+            ValueError: If `model_name` is not in the required <org>/<model> format.
         """

         super().__init__(model_name, cache_dir, threads, **kwargs)
@@ -154,19 +150,15 @@ def embed(
         **kwargs: Any,
     ) -> Iterable[NumpyArray]:
         """
-        Encode a list of images into list of embeddings.
-        We use mean pooling with attention so that the model can handle variable-length inputs.
-
+        Generates embeddings for one or more images using the loaded ONNX model.
+
         Args:
-            images: Iterator of image paths or single image path to embed
-            batch_size: Batch size for encoding -- higher values will use more memory, but be faster
-            parallel:
-                If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets.
-                If 0, use all available cores.
-                If None, don't use data-parallel processing, use default onnxruntime threading instead.
-
+            images: A single image input or an iterable of image inputs to embed.
+            batch_size: Number of images to process in each batch.
+            parallel: Number of parallel workers to use for data-parallel encoding. If 0, uses all available cores; if None, disables parallel processing.
+
         Returns:
-            List of embeddings, one per document
+            An iterable of numpy arrays, each representing the embedding of an input image.
         """

         yield from self._embed_images(
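To see how the options documented above combine in practice, here is a minimal usage sketch. It assumes fastembed's public `ImageEmbedding` wrapper and the model name "Qdrant/clip-ViT-B-32-vision"; neither is asserted by this diff.

from fastembed import ImageEmbedding

# lazy_load=True defers ONNX session creation, which the docstring recommends
# when combining multiple GPUs with parallel encoding.
model = ImageEmbedding(
    model_name="Qdrant/clip-ViT-B-32-vision",
    lazy_load=True,
    cuda=True,           # mutually exclusive with `providers`
    device_ids=[0, 1],   # one worker per listed GPU
)

images = ["photos/cat.jpg", "photos/dog.jpg"]
# parallel=2 enables data-parallel encoding; parallel=None would instead use
# default onnxruntime threading in the main process.
embeddings = list(model.embed(images, batch_size=16, parallel=2))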

fastembed/image/onnx_image_model.py

Lines changed: 19 additions & 0 deletions
@@ -101,6 +101,25 @@ def _embed_images(
         specific_model_path: Optional[str] = None,
         **kwargs: Any,
     ) -> Iterable[T]:
+        """
+        Embeds images using the ONNX model, processing them sequentially or in parallel.
+
+        Depending on the input size and the `parallel` parameter, images are embedded either in batches on the main process or distributed across multiple worker processes. Supports additional configuration for model loading and caching.
+
+        Args:
+            model_name: Name of the ONNX model to use.
+            cache_dir: Directory for model caching.
+            images: Single image or iterable of images to embed.
+            batch_size: Number of images per batch.
+            parallel: Number of parallel worker processes to use; if None or input is small, runs sequentially.
+            cuda: Whether to use CUDA-enabled devices.
+            device_ids: List of device IDs for parallel workers.
+            local_files_only: If True, restricts model loading to local files.
+            specific_model_path: Path to a specific model file to load.
+
+        Yields:
+            Embeddings for each input image, post-processed as defined by the subclass.
+        """
         is_small = False

         if isinstance(images, (str, Path, Image.Image)):
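The new docstring describes a small-input fast path versus a fan-out to worker processes. A hedged sketch of that dispatch pattern follows; the helper names (`embed_dispatch`, `_encode_batch`) and the threshold are invented for illustration and do not come from fastembed's source.

import os
from multiprocessing import Pool
from typing import Iterable, Optional

def _encode_batch(batch: list[str]) -> list[str]:
    # Stand-in for ONNX inference on one batch.
    return [f"embedding({item})" for item in batch]

def embed_dispatch(
    items: list[str], batch_size: int = 2, parallel: Optional[int] = None
) -> Iterable[str]:
    batches = [items[i : i + batch_size] for i in range(0, len(items), batch_size)]
    is_small = len(items) < batch_size  # hypothetical "small input" threshold
    if parallel is None or is_small:
        # Sequential path: stay on the main process.
        for batch in batches:
            yield from _encode_batch(batch)
    else:
        # Parallel path: parallel == 0 means "use all available cores".
        workers = parallel or os.cpu_count()
        with Pool(processes=workers) as pool:
            for result in pool.map(_encode_batch, batches):
                yield from result

if __name__ == "__main__":
    print(list(embed_dispatch(["a", "b", "c", "d", "e"], parallel=2)))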

fastembed/late_interaction/colbert.py

Lines changed: 10 additions & 30 deletions
@@ -130,25 +130,9 @@ def __init__(
         **kwargs: Any,
     ):
         """
-        Args:
-            model_name (str): The name of the model to use.
-            cache_dir (str, optional): The path to the cache directory.
-                Can be set using the `FASTEMBED_CACHE_PATH` env variable.
-                Defaults to `fastembed_cache` in the system's temp directory.
-            threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None.
-            providers (Optional[Sequence[OnnxProvider]], optional): The list of onnxruntime providers to use.
-                Mutually exclusive with the `cuda` and `device_ids` arguments. Defaults to None.
-            cuda (bool, optional): Whether to use cuda for inference. Mutually exclusive with `providers`
-                Defaults to False.
-            device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in
-                workers. Should be used with `cuda=True`, mutually exclusive with `providers`. Defaults to None.
-            lazy_load (bool, optional): Whether to load the model during class initialization or on demand.
-                Should be set to True when using multiple-gpu and parallel encoding. Defaults to False.
-            device_id (Optional[int], optional): The device id to use for loading the model in the worker process.
-            specific_model_path (Optional[str], optional): The specific path to the onnx model dir if it should be imported from somewhere else
-
-        Raises:
-            ValueError: If the model_name is not in the format <org>/<model> e.g. BAAI/bge-base-en.
+        Initializes a Colbert model instance for ONNX-based late interaction text embedding.
+
+        Configures model loading, device selection, threading, and caching options. Optionally supports lazy loading and specifying a custom ONNX model path. Raises a ValueError if the model name format is invalid.
         """

         super().__init__(model_name, cache_dir, threads, **kwargs)
@@ -211,19 +195,15 @@ def embed(
         **kwargs: Any,
     ) -> Iterable[NumpyArray]:
         """
-        Encode a list of documents into list of embeddings.
-        We use mean pooling with attention so that the model can handle variable-length inputs.
-
+        Generates embeddings for one or more documents using mean pooling with attention.
+
         Args:
-            documents: Iterator of documents or single document to embed
-            batch_size: Batch size for encoding -- higher values will use more memory, but be faster
-            parallel:
-                If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets.
-                If 0, use all available cores.
-                If None, don't use data-parallel processing, use default onnxruntime threading instead.
-
+            documents: A single document or an iterable of documents to embed.
+            batch_size: Number of documents to process per batch.
+            parallel: Number of parallel workers to use for data-parallel encoding. If 0, uses all available cores. If None, uses default threading.
+
         Returns:
-            List of embeddings, one per document
+            An iterable of embeddings, one per input document.
         """
         yield from self._embed_documents(
             model_name=self.model_name,
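A short usage sketch for the embed() signature documented above; the `LateInteractionTextEmbedding` wrapper and the "colbert-ir/colbertv2.0" model name are assumptions about fastembed's public API, not part of this diff.

from fastembed import LateInteractionTextEmbedding

model = LateInteractionTextEmbedding("colbert-ir/colbertv2.0")

docs = [
    "ColBERT keeps one vector per token.",
    "Late interaction scores documents at query time.",
]
# Each yielded array is typically token-level, shaped (num_tokens, dim),
# rather than a single pooled vector.
for emb in model.embed(docs, batch_size=8, parallel=None):
    print(emb.shape)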

fastembed/late_interaction_multimodal/colpali.py

Lines changed: 17 additions & 38 deletions
@@ -57,24 +57,9 @@ def __init__(
         **kwargs: Any,
     ):
         """
-        Args:
-            model_name (str): The name of the model to use.
-            cache_dir (str, optional): The path to the cache directory.
-                Can be set using the `FASTEMBED_CACHE_PATH` env variable.
-                Defaults to `fastembed_cache` in the system's temp directory.
-            threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None.
-            providers (Optional[Sequence[OnnxProvider]], optional): The list of onnxruntime providers to use.
-                Mutually exclusive with the `cuda` and `device_ids` arguments. Defaults to None.
-            cuda (bool, optional): Whether to use cuda for inference. Mutually exclusive with `providers`
-                Defaults to False.
-            device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in
-                workers. Should be used with `cuda=True`, mutually exclusive with `providers`. Defaults to None.
-            lazy_load (bool, optional): Whether to load the model during class initialization or on demand.
-                Should be set to True when using multiple-gpu and parallel encoding. Defaults to False.
-            device_id (Optional[int], optional): The device id to use for loading the model in the worker process.
-
-        Raises:
-            ValueError: If the model_name is not in the format <org>/<model> e.g. BAAI/bge-base-en.
+        Initializes the ColPali multimodal embedding model with specified configuration.
+
+        Configures model loading, device and threading options, ONNX runtime providers, and cache directory. Supports lazy loading, CUDA acceleration, and custom model paths. Raises a ValueError if the model name format is invalid.
         """

         super().__init__(model_name, cache_dir, threads, **kwargs)
@@ -214,18 +199,15 @@ def embed_text(
         **kwargs: Any,
     ) -> Iterable[NumpyArray]:
         """
-        Encode a list of documents into list of embeddings.
-
+        Generates embeddings for one or more text documents.
+
         Args:
-            documents: Iterator of documents or single document to embed
-            batch_size: Batch size for encoding -- higher values will use more memory, but be faster
-            parallel:
-                If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets.
-                If 0, use all available cores.
-                If None, don't use data-parallel processing, use default onnxruntime threading instead.
-
+            documents: A string or iterable of strings representing the documents to embed.
+            batch_size: Number of documents to process per batch.
+            parallel: Number of parallel workers to use for encoding. If 0, uses all available cores; if None, disables parallelism.
+
         Returns:
-            List of embeddings, one per document
+            An iterable of NumPy arrays, each representing the embedding of a document.
         """
         yield from self._embed_documents(
             model_name=self.model_name,
@@ -249,18 +231,15 @@ def embed_image(
         **kwargs: Any,
     ) -> Iterable[NumpyArray]:
         """
-        Encode a list of images into list of embeddings.
-
+        Generates embeddings for one or more images.
+
         Args:
-            images: Iterator of image paths or single image path to embed
-            batch_size: Batch size for encoding -- higher values will use more memory, but be faster
-            parallel:
-                If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets.
-                If 0, use all available cores.
-                If None, don't use data-parallel processing, use default onnxruntime threading instead.
-
+            images: A single image input or an iterable of image inputs to embed.
+            batch_size: Number of images to process per batch.
+            parallel: Number of parallel workers to use for encoding. If 0, uses all available cores; if None, disables parallel processing.
+
         Returns:
-            List of embeddings, one per document
+            An iterable of NumPy arrays, each representing the embedding of an input image.
         """
         yield from self._embed_images(
             model_name=self.model_name,
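The two methods above embed into one shared multi-vector space, which is the point of ColPali-style retrieval. A usage sketch follows; the `LateInteractionMultimodalEmbedding` wrapper and the model name are assumptions about fastembed's public API.

from fastembed import LateInteractionMultimodalEmbedding

model = LateInteractionMultimodalEmbedding(
    "Qdrant/colpali-v1.3-fp16", lazy_load=True
)

# Text queries and page images are embedded into the same space, so query
# token vectors can be matched against page patch vectors at search time.
query_embeddings = list(model.embed_text(["retrieval-augmented generation"]))
page_embeddings = list(model.embed_image(["pages/page_01.png"], batch_size=4))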

fastembed/late_interaction_multimodal/onnx_multimodal_model.py

Lines changed: 25 additions & 0 deletions
@@ -124,6 +124,26 @@ def _embed_documents(
         specific_model_path: Optional[str] = None,
         **kwargs: Any,
     ) -> Iterable[T]:
+        """
+        Embeds a collection of text documents using the ONNX model, with optional parallel processing.
+
+        If the input is small or parallelism is not requested, processes documents in batches on the main process. Otherwise, distributes batches across parallel worker processes. Supports additional options for local file usage and specifying a model path.
+
+        Args:
+            model_name: Name of the ONNX model to use.
+            cache_dir: Directory for model caching.
+            documents: Single string or iterable of text documents to embed.
+            batch_size: Number of documents per batch.
+            parallel: Number of parallel worker processes to use. If None or input is small, runs in the main process.
+            providers: Optional sequence of ONNX runtime providers.
+            cuda: Whether to use CUDA-enabled devices.
+            device_ids: Optional list of device IDs for parallel workers.
+            local_files_only: If True, restricts model loading to local files.
+            specific_model_path: Optional path to a specific model file.
+
+        Yields:
+            Embeddings for each input document, in order.
+        """
         is_small = False

         if isinstance(documents, str):
@@ -191,6 +211,11 @@ def _embed_images(
         specific_model_path: Optional[str] = None,
         **kwargs: Any,
     ) -> Iterable[T]:
+        """
+        Embeds images using the ONNX model, with optional parallel processing.
+
+        Processes a collection of images in batches, either sequentially or in parallel using worker processes. Supports loading models from local files only or a specific model path if specified. Yields post-processed embeddings for each image.
+        """
         is_small = False

         if isinstance(images, (str, Path, Image.Image)):
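Both docstrings above promise results in input order even on the parallel path. Below is a hedged sketch of one way to preserve ordering across worker processes; it is illustrative only and not the fastembed implementation.

from multiprocessing import Pool

def encode(batch: list[int]) -> list[int]:
    # Stand-in for ONNX inference on one batch.
    return [x * x for x in batch]

if __name__ == "__main__":
    batches = [[1, 2], [3, 4], [5, 6]]
    with Pool(processes=2) as pool:
        # imap (unlike imap_unordered) yields results in submission order,
        # so embeddings can be zipped back to their inputs downstream.
        for result in pool.imap(encode, batches):
            print(result)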

fastembed/rerank/cross_encoder/onnx_text_cross_encoder.py

Lines changed: 14 additions & 19 deletions
@@ -88,25 +88,9 @@ def __init__(
         **kwargs: Any,
     ):
         """
-        Args:
-            model_name (str): The name of the model to use.
-            cache_dir (str, optional): The path to the cache directory.
-                Can be set using the `FASTEMBED_CACHE_PATH` env variable.
-                Defaults to `fastembed_cache` in the system's temp directory.
-            threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None.
-            providers (Optional[Sequence[OnnxProvider]], optional): The list of onnxruntime providers to use.
-                Mutually exclusive with the `cuda` and `device_ids` arguments. Defaults to None.
-            cuda (bool, optional): Whether to use cuda for inference. Mutually exclusive with `providers`
-                Defaults to False.
-            device_ids (Optional[list[int]], optional): The list of device ids to use for data parallel processing in
-                workers. Should be used with `cuda=True`, mutually exclusive with `providers`. Defaults to None.
-            lazy_load (bool, optional): Whether to load the model during class initialization or on demand.
-                Should be set to True when using multiple-gpu and parallel encoding. Defaults to False.
-            device_id (Optional[int], optional): The device id to use for loading the model in the worker process.
-            specific_model_path (Optional[str], optional): The specific path to the onnx model dir if it should be imported from somewhere else
-
-        Raises:
-            ValueError: If the model_name is not in the format <org>/<model> e.g. Xenova/ms-marco-MiniLM-L-6-v2.
+        Initializes an ONNX-based cross-encoder model for text re-ranking.
+
+        Configures model selection, caching, threading, device assignment, ONNX runtime providers, and model loading behavior. Downloads and prepares the ONNX model for inference, with support for custom model paths and lazy loading. Raises a ValueError if the model name format is invalid.
         """
         super().__init__(model_name, cache_dir, threads, **kwargs)
         self.providers = providers
@@ -181,6 +165,17 @@ def rerank_pairs(
         parallel: Optional[int] = None,
         **kwargs: Any,
     ) -> Iterable[float]:
+        """
+        Reranks pairs of texts using the ONNX cross-encoder model.
+
+        Args:
+            pairs: An iterable of (query, document) string tuples to be scored.
+            batch_size: Number of pairs to process in each batch. Defaults to 64.
+            parallel: Optional number of parallel workers for processing.
+
+        Yields:
+            Relevance scores as floats for each input pair, in order.
+        """
         yield from self._rerank_pairs(
             model_name=self.model_name,
             cache_dir=str(self.cache_dir),
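A usage sketch for rerank_pairs() as documented above. The `TextCrossEncoder` import path is an assumption about fastembed's public API; the model name comes from the example in the removed docstring.

from fastembed.rerank.cross_encoder import TextCrossEncoder

encoder = TextCrossEncoder("Xenova/ms-marco-MiniLM-L-6-v2")

pairs = [
    ("what is vector search", "Vector search finds nearest neighbors in embedding space."),
    ("what is vector search", "Bananas are rich in potassium."),
]
# Scores are yielded as floats, one per pair, in input order.
for score in encoder.rerank_pairs(pairs, batch_size=64, parallel=None):
    print(score)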

fastembed/rerank/cross_encoder/onnx_text_model.py

Lines changed: 20 additions & 0 deletions
@@ -98,6 +98,20 @@ def _rerank_pairs(
         specific_model_path: Optional[str] = None,
         **kwargs: Any,
     ) -> Iterable[float]:
+        """
+        Reranks a sequence of text pairs using the ONNX cross-encoder model, with optional parallel processing.
+
+        If parallel processing is enabled and the input is large, distributes batches across multiple worker processes; otherwise, processes batches in the current process. Supports additional options for model loading, including restricting to local files and specifying a model path.
+
+        Args:
+            model_name: Name of the ONNX model to use.
+            cache_dir: Directory for model caching.
+            pairs: Iterable of (query, document) text pairs to rerank.
+            batch_size: Number of pairs per inference batch.
+            parallel: Number of worker processes to use; if None or input is small, runs in the current process.
+            providers: Optional ONNX runtime providers.
+            cuda: Whether to use CUDA-enabled devices.
+            device_ids: Optional list of device IDs for parallel workers.
+            local_files_only: If True, restricts model loading to local files only.
+            specific_model_path: Optional path to a specific model file.
+
+        Yields:
+            Reranked scores as floats, in the same order as the input pairs.
+        """
         is_small = False

         if isinstance(pairs, tuple):
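The trailing context line checks `isinstance(pairs, tuple)`, which suggests a single (query, document) tuple is normalized into a one-element collection before batching. A hedged sketch of that normalization pattern; the helper name is invented and this is not the fastembed source.

from typing import Iterable, Union

Pair = tuple[str, str]

def normalize_pairs(pairs: Union[Pair, Iterable[Pair]]) -> list[Pair]:
    # A bare tuple is treated as one (query, document) pair.
    if isinstance(pairs, tuple):
        return [pairs]
    return list(pairs)

print(normalize_pairs(("query", "document")))          # [("query", "document")]
print(normalize_pairs([("q1", "d1"), ("q2", "d2")]))   # unchanged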
