chore: Add missing type hints in functions (#453)

hh-space-invader · joein · web-flow · commit 993dcd5f6895 · 2025-01-29T12:08:20.000+01:00
* chore: Add missing type hints in functions

* add missing import, small type refactor

---------

Co-authored-by: George Panchuk <george.panchuk@qdrant.tech>
diff --git a/fastembed/common/model_management.py b/fastembed/common/model_management.py
@@ -4,7 +4,7 @@
 import shutil
 import tarfile
 from pathlib import Path
-from typing import Any
+from typing import Any, Optional
 
 import requests
 from huggingface_hub import snapshot_download, model_info, list_repo_tree
diff --git a/fastembed/common/types.py b/fastembed/common/types.py
@@ -1,4 +1,4 @@
-import os
+from pathlib import Path
 import sys
 from PIL import Image
 from typing import Any, Iterable, Union
@@ -9,7 +9,7 @@
     from typing_extensions import TypeAlias
 
 
-PathInput: TypeAlias = Union[str, os.PathLike]
+PathInput: TypeAlias = Union[str, Path]
 PilInput: TypeAlias = Union[Image.Image, Iterable[Image.Image]]
 ImageInput: TypeAlias = Union[PathInput, Iterable[PathInput], PilInput]
 
diff --git a/fastembed/common/utils.py b/fastembed/common/utils.py
@@ -5,20 +5,22 @@
 import unicodedata
 from pathlib import Path
 from itertools import islice
-from typing import Generator, Iterable, Optional, Union
+from typing import Iterable, Optional, TypeVar
 
 import numpy as np
 
+T = TypeVar("T")
 
-def normalize(input_array, p=2, dim=1, eps=1e-12) -> np.ndarray:
+
+def normalize(input_array: np.ndarray, p: int = 2, dim: int = 1, eps: float = 1e-12) -> np.ndarray:
     # Calculate the Lp norm along the specified dimension
     norm = np.linalg.norm(input_array, ord=p, axis=dim, keepdims=True)
     norm = np.maximum(norm, eps)  # Avoid division by zero
     normalized_array = input_array / norm
     return normalized_array
 
 
-def iter_batch(iterable: Union[Iterable, Generator], size: int) -> Iterable:
+def iter_batch(iterable: Iterable[T], size: int) -> Iterable[list[T]]:
     """
     >>> list(iter_batch([1,2,3,4,5], 3))
     [[1, 2, 3], [4, 5]]
diff --git a/fastembed/image/transform/functional.py b/fastembed/image/transform/functional.py
@@ -114,7 +114,7 @@ def resize(
     return image.resize(new_size, resample)
 
 
-def rescale(image: np.ndarray, scale: float, dtype=np.float32) -> np.ndarray:
+def rescale(image: np.ndarray, scale: float, dtype: type = np.float32) -> np.ndarray:
     return (image * scale).astype(dtype)
 
 
diff --git a/tests/test_attention_embeddings.py b/tests/test_attention_embeddings.py
@@ -8,7 +8,7 @@
 
 
 @pytest.mark.parametrize("model_name", ["Qdrant/bm42-all-minilm-l6-v2-attentions", "Qdrant/bm25"])
-def test_attention_embeddings(model_name) -> None:
+def test_attention_embeddings(model_name: str) -> None:
     is_ci = os.getenv("CI")
     model = SparseTextEmbedding(model_name=model_name)
 
@@ -71,7 +71,7 @@ def test_attention_embeddings(model_name) -> None:
 
 
 @pytest.mark.parametrize("model_name", ["Qdrant/bm42-all-minilm-l6-v2-attentions", "Qdrant/bm25"])
-def test_parallel_processing(model_name) -> None:
+def test_parallel_processing(model_name: str) -> None:
     is_ci = os.getenv("CI")
 
     model = SparseTextEmbedding(model_name=model_name)
@@ -96,7 +96,7 @@ def test_parallel_processing(model_name) -> None:
 
 
 @pytest.mark.parametrize("model_name", ["Qdrant/bm25"])
-def test_multilanguage(model_name) -> None:
+def test_multilanguage(model_name: str) -> None:
     is_ci = os.getenv("CI")
 
     docs = ["Mangez-vous vraiment des grenouilles?", "Je suis au lit"]
@@ -122,7 +122,7 @@ def test_multilanguage(model_name) -> None:
 
 
 @pytest.mark.parametrize("model_name", ["Qdrant/bm25"])
-def test_special_characters(model_name) -> None:
+def test_special_characters(model_name: str) -> None:
     is_ci = os.getenv("CI")
 
     docs = [
@@ -145,7 +145,7 @@ def test_special_characters(model_name) -> None:
 
 
 @pytest.mark.parametrize("model_name", ["Qdrant/bm42-all-minilm-l6-v2-attentions"])
-def test_lazy_load(model_name) -> None:
+def test_lazy_load(model_name: str) -> None:
     model = SparseTextEmbedding(model_name=model_name, lazy_load=True)
     assert not hasattr(model.model, "model")
     docs = ["hello world", "flag embedding"]
diff --git a/tests/test_image_onnx_embeddings.py b/tests/test_image_onnx_embeddings.py
@@ -61,7 +61,7 @@ def test_embedding() -> None:
 
 
 @pytest.mark.parametrize("n_dims,model_name", [(512, "Qdrant/clip-ViT-B-32-vision")])
-def test_batch_embedding(n_dims, model_name) -> None:
+def test_batch_embedding(n_dims: int, model_name: str) -> None:
     is_ci = os.getenv("CI")
     model = ImageEmbedding(model_name=model_name)
     n_images = 32
@@ -81,7 +81,7 @@ def test_batch_embedding(n_dims, model_name) -> None:
 
 
 @pytest.mark.parametrize("n_dims,model_name", [(512, "Qdrant/clip-ViT-B-32-vision")])
-def test_parallel_processing(n_dims, model_name) -> None:
+def test_parallel_processing(n_dims: int, model_name: str) -> None:
     is_ci = os.getenv("CI")
     model = ImageEmbedding(model_name=model_name)
 
@@ -109,7 +109,7 @@ def test_parallel_processing(n_dims, model_name) -> None:
 
 
 @pytest.mark.parametrize("model_name", ["Qdrant/clip-ViT-B-32-vision"])
-def test_lazy_load(model_name) -> None:
+def test_lazy_load(model_name: str) -> None:
     is_ci = os.getenv("CI")
     model = ImageEmbedding(model_name=model_name, lazy_load=True)
     assert not hasattr(model.model, "model")
diff --git a/tests/test_late_interaction_embeddings.py b/tests/test_late_interaction_embeddings.py
@@ -226,7 +226,7 @@ def test_parallel_processing():
     "model_name",
     ["colbert-ir/colbertv2.0"],
 )
-def test_lazy_load(model_name):
+def test_lazy_load(model_name: str):
     is_ci = os.getenv("CI")
 
     model = LateInteractionTextEmbedding(model_name=model_name, lazy_load=True)
diff --git a/tests/test_multi_gpu.py b/tests/test_multi_gpu.py
@@ -1,4 +1,5 @@
 import pytest
+from typing import Optional
 from fastembed import (
     TextEmbedding,
     SparseTextEmbedding,
@@ -13,7 +14,7 @@
 
 @pytest.mark.skip(reason="Requires a multi-gpu server")
 @pytest.mark.parametrize("device_id", [None, 0, 1])
-def test_gpu_via_providers(device_id) -> None:
+def test_gpu_via_providers(device_id: Optional[int]) -> None:
     docs = ["hello world", "flag embedding"]
 
     device_id = device_id if device_id is not None else 0
@@ -85,7 +86,7 @@ def test_gpu_via_providers(device_id) -> None:
 
 @pytest.mark.skip(reason="Requires a multi-gpu server")
 @pytest.mark.parametrize("device_ids", [None, [0], [1], [0, 1]])
-def test_gpu_cuda_device_ids(device_ids) -> None:
+def test_gpu_cuda_device_ids(device_ids: Optional[list[int]]) -> None:
     docs = ["hello world", "flag embedding"]
     device_id = device_ids[0] if device_ids else 0
     embedding_model = TextEmbedding(
@@ -170,7 +171,7 @@ def test_gpu_cuda_device_ids(device_ids) -> None:
 @pytest.mark.parametrize(
     "device_ids,parallel", [(None, None), (None, 2), ([1], None), ([1], 1), ([1], 2), ([0, 1], 2)]
 )
-def test_multi_gpu_parallel_inference(device_ids, parallel) -> None:
+def test_multi_gpu_parallel_inference(device_ids: Optional[list[int]], parallel: int) -> None:
     docs = ["hello world", "flag embedding"] * 100
     batch_size = 5
 
diff --git a/tests/test_sparse_embeddings.py b/tests/test_sparse_embeddings.py
@@ -119,7 +119,7 @@ def bm25_instance() -> None:
         delete_model_cache(model._model_dir)
 
 
-def test_stem_with_stopwords_and_punctuation(bm25_instance) -> None:
+def test_stem_with_stopwords_and_punctuation(bm25_instance: Bm25) -> None:
     # Setup
     bm25_instance.stopwords = {"the", "is", "a"}
     bm25_instance.punctuation = {".", ",", "!"}
@@ -135,7 +135,7 @@ def test_stem_with_stopwords_and_punctuation(bm25_instance) -> None:
     assert result == expected, f"Expected {expected}, but got {result}"
 
 
-def test_stem_case_insensitive_stopwords(bm25_instance) -> None:
+def test_stem_case_insensitive_stopwords(bm25_instance: Bm25) -> None:
     # Setup
     bm25_instance.stopwords = {"the", "is", "a"}
     bm25_instance.punctuation = {".", ",", "!"}
@@ -152,7 +152,7 @@ def test_stem_case_insensitive_stopwords(bm25_instance) -> None:
 
 
 @pytest.mark.parametrize("disable_stemmer", [True, False])
-def test_disable_stemmer_behavior(disable_stemmer) -> None:
+def test_disable_stemmer_behavior(disable_stemmer: bool) -> None:
     # Setup
     model = Bm25("Qdrant/bm25", language="english", disable_stemmer=disable_stemmer)
     model.stopwords = {"the", "is", "a"}
@@ -176,7 +176,7 @@ def test_disable_stemmer_behavior(disable_stemmer) -> None:
     "model_name",
     ["prithivida/Splade_PP_en_v1"],
 )
-def test_lazy_load(model_name) -> None:
+def test_lazy_load(model_name: str) -> None:
     is_ci = os.getenv("CI")
     model = SparseTextEmbedding(model_name=model_name, lazy_load=True)
     assert not hasattr(model.model, "model")
diff --git a/tests/test_text_cross_encoder.py b/tests/test_text_cross_encoder.py
@@ -26,7 +26,7 @@
     "model_name",
     [model_name for model_name in CANONICAL_SCORE_VALUES],
 )
-def test_rerank(model_name) -> None:
+def test_rerank(model_name: str) -> None:
     is_ci = os.getenv("CI")
 
     model = TextCrossEncoder(model_name=model_name)
@@ -53,7 +53,7 @@ def test_rerank(model_name) -> None:
     "model_name",
     [model_name for model_name in SELECTED_MODELS.values()],
 )
-def test_batch_rerank(model_name) -> None:
+def test_batch_rerank(model_name: str) -> None:
     is_ci = os.getenv("CI")
 
     model = TextCrossEncoder(model_name=model_name)
@@ -82,7 +82,7 @@ def test_batch_rerank(model_name) -> None:
     "model_name",
     ["Xenova/ms-marco-MiniLM-L-6-v2"],
 )
-def test_lazy_load(model_name) -> None:
+def test_lazy_load(model_name: str) -> None:
     is_ci = os.getenv("CI")
     model = TextCrossEncoder(model_name=model_name, lazy_load=True)
     assert not hasattr(model.model, "model")
@@ -99,7 +99,7 @@ def test_lazy_load(model_name) -> None:
     "model_name",
     [model_name for model_name in SELECTED_MODELS.values()],
 )
-def test_rerank_pairs_parallel(model_name) -> None:
+def test_rerank_pairs_parallel(model_name: str) -> None:
     is_ci = os.getenv("CI")
 
     model = TextCrossEncoder(model_name=model_name)
diff --git a/tests/test_text_multitask_embeddings.py b/tests/test_text_multitask_embeddings.py
@@ -241,7 +241,7 @@ def test_task_assignment():
     "model_name",
     ["jinaai/jina-embeddings-v3"],
 )
-def test_lazy_load(model_name):
+def test_lazy_load(model_name: str):
     is_ci = os.getenv("CI")
     model = TextEmbedding(model_name=model_name, lazy_load=True)
     assert not hasattr(model.model, "model")
diff --git a/tests/test_text_onnx_embeddings.py b/tests/test_text_onnx_embeddings.py
@@ -104,7 +104,7 @@ def test_embedding() -> None:
     "n_dims,model_name",
     [(384, "BAAI/bge-small-en-v1.5"), (768, "jinaai/jina-embeddings-v2-base-en")],
 )
-def test_batch_embedding(n_dims, model_name) -> None:
+def test_batch_embedding(n_dims: int, model_name: str) -> None:
     is_ci = os.getenv("CI")
     model = TextEmbedding(model_name=model_name)
 
@@ -121,7 +121,7 @@ def test_batch_embedding(n_dims, model_name) -> None:
     "n_dims,model_name",
     [(384, "BAAI/bge-small-en-v1.5"), (768, "jinaai/jina-embeddings-v2-base-en")],
 )
-def test_parallel_processing(n_dims, model_name) -> None:
+def test_parallel_processing(n_dims: int, model_name: str) -> None:
     is_ci = os.getenv("CI")
     model = TextEmbedding(model_name=model_name)
 
@@ -147,7 +147,7 @@ def test_parallel_processing(n_dims, model_name) -> None:
     "model_name",
     ["BAAI/bge-small-en-v1.5"],
 )
-def test_lazy_load(model_name) -> None:
+def test_lazy_load(model_name: str) -> None:
     is_ci = os.getenv("CI")
     model = TextEmbedding(model_name=model_name, lazy_load=True)
     assert not hasattr(model.model, "model")
diff --git a/tests/utils.py b/tests/utils.py
@@ -2,7 +2,8 @@
 import traceback
 
 from pathlib import Path
-from typing import Union
+from types import TracebackType
+from typing import Union, Callable, Any, Type
 
 
 def delete_model_cache(model_dir: Union[str, Path]) -> None:
@@ -16,7 +17,11 @@ def delete_model_cache(model_dir: Union[str, Path]) -> None:
         model_dir (Union[str, Path]): The path to the model cache directory.
     """
 
-    def on_error(func, path, exc_info) -> None:
+    def on_error(
+        func: Callable[..., Any],
+        path: str,
+        exc_info: tuple[Type[BaseException], BaseException, TracebackType],
+    ) -> None:
         print("Failed to remove: ", path)
         print("Exception: ", exc_info)
         traceback.print_exception(*exc_info)

Original file line number	Diff line number	Diff line change
`@@ -226,7 +226,7 @@ def test_parallel_processing():`
`226`	`226`	`"model_name",`
`227`	`227`	`["colbert-ir/colbertv2.0"],`
`228`	`228`	`)`
`229`		`-def test_lazy_load(model_name):`
	`229`	`+def test_lazy_load(model_name: str):`
`230`	`230`	`is_ci = os.getenv("CI")`
`231`	`231`
`232`	`232`	`model = LateInteractionTextEmbedding(model_name=model_name, lazy_load=True)`