Add bioimage search utils and example notebook (#553)

NikolaosPapailiou · web-flow · commit 53c8c15beedf · 2024-10-17T19:47:44.000+03:00
Add bioimage search utils and example notebook
diff --git a/apis/python/examples/object_api/bioimg_similarity_search.ipynb b/apis/python/examples/object_api/bioimg_similarity_search.ipynb
diff --git a/apis/python/src/tiledb/vector_search/embeddings/__init__.py b/apis/python/src/tiledb/vector_search/embeddings/__init__.py
@@ -1,3 +1,4 @@
+from .huggingface_auto_image_embedding import HuggingfaceAutoImageEmbedding
 from .image_resnetv2_embedding import ImageResNetV2Embedding
 from .langchain_embedding import LangChainEmbedding
 from .object_embedding import ObjectEmbedding
@@ -11,6 +12,7 @@
     "ObjectEmbedding",
     "SomaGenePTwEmbedding",
     "ImageResNetV2Embedding",
+    "HuggingfaceAutoImageEmbedding",
     "RandomEmbedding",
     "SentenceTransformersEmbedding",
     "LangChainEmbedding",
diff --git a/apis/python/src/tiledb/vector_search/embeddings/huggingface_auto_image_embedding.py b/apis/python/src/tiledb/vector_search/embeddings/huggingface_auto_image_embedding.py
@@ -0,0 +1,131 @@
+from typing import Dict, Optional, OrderedDict
+
+import numpy as np
+
+
+class HuggingfaceAutoImageEmbedding:
+    """
+    Embeds images using a Hugging Face `transformers` vision model.
+
+    The image processor and the model are resolved with `AutoImageProcessor` and
+    `AutoModel`. The embedding of an image is the first position of the model's
+    `last_hidden_state`.
+    """
+
+    def __init__(
+        self,
+        model_name_or_path: str,
+        dimensions: int,
+        device: Optional[str] = None,
+        cache_folder: Optional[str] = None,
+        batch_size: int = 64,
+    ):
+        """
+        Parameters
+        ----------
+        model_name_or_path: str
+            Model name or path passed to `from_pretrained`.
+        dimensions: int
+            Number of dimensions of the embeddings produced by the model.
+        device: Optional[str]
+            Torch device to run the model on. If None, the model and its inputs
+            are left on their default device.
+        cache_folder: Optional[str]
+            Directory passed to `from_pretrained` as `cache_dir`. If None, the
+            default `transformers` cache location is used.
+        batch_size: int
+            Maximum number of images embedded in one forward pass.
+        """
+        self.model_name_or_path = model_name_or_path
+        self.dim_num = dimensions
+        self.device = device
+        self.cache_folder = cache_folder
+        self.batch_size = batch_size
+        # Populated by `load()`.
+        self.processor = None
+        self.model = None
+
+    def init_kwargs(self) -> Dict:
+        """Returns the keyword arguments needed to re-create this embedding."""
+        return {
+            "model_name_or_path": self.model_name_or_path,
+            "dimensions": self.dim_num,
+            "device": self.device,
+            "cache_folder": self.cache_folder,
+            "batch_size": self.batch_size,
+        }
+
+    def dimensions(self) -> int:
+        """Returns the number of dimensions of the embeddings."""
+        return self.dim_num
+
+    def vector_type(self) -> np.dtype:
+        """Returns the datatype of the embeddings."""
+        return np.float32
+
+    def load(self) -> None:
+        """
+        Loads the image processor and the model.
+
+        `transformers` is imported lazily, so it is only needed once `load()` is
+        called.
+        """
+        from transformers import AutoImageProcessor
+        from transformers import AutoModel
+
+        self.processor = AutoImageProcessor.from_pretrained(
+            self.model_name_or_path, cache_dir=self.cache_folder
+        )
+        self.model = AutoModel.from_pretrained(
+            self.model_name_or_path, cache_dir=self.cache_folder
+        )
+        if self.device is not None:
+            self.model.to(self.device)
+
+    def _embed_batch(self, image_batch) -> np.ndarray:
+        """Embeds one batch of PIL images, returning one row per image."""
+        inputs = self.processor(images=image_batch, return_tensors="pt")
+        if self.device is not None:
+            # Inputs must live on the same device as the model.
+            inputs = inputs.to(self.device)
+        return self.model(**inputs).last_hidden_state[:, 0].cpu().detach().numpy()
+
+    def embed(self, objects: OrderedDict, metadata: OrderedDict) -> np.ndarray:
+        """
+        Creates embeddings for a set of images.
+
+        Parameters
+        ----------
+        objects: OrderedDict
+            Must contain "image" (one array per image) and "shape" (the shape each
+            image array is reshaped to before it is converted to a PIL image),
+            aligned by position.
+        metadata: OrderedDict
+            Unused.
+
+        Returns
+        -------
+        np.ndarray
+            A float32 array of shape `(len(objects["image"]), dimensions)` whose
+            i-th row is the embedding of the i-th image.
+        """
+        size = len(objects["image"])
+        embeddings = np.zeros((size, self.dim_num), dtype=np.float32)
+        if size == 0:
+            return embeddings
+
+        from PIL import Image
+
+        # A non-positive batch size falls back to one image per forward pass.
+        step = max(1, self.batch_size)
+        for start in range(0, size, step):
+            end = min(start + step, size)
+            image_batch = [
+                Image.fromarray(
+                    np.reshape(objects["image"][image_id], objects["shape"][image_id])
+                )
+                for image_id in range(start, end)
+            ]
+            # Each batch is written at its own offset so that earlier batches are
+            # not overwritten.
+            embeddings[start:end] = self._embed_batch(image_batch)
+        return embeddings
diff --git a/apis/python/src/tiledb/vector_search/object_readers/__init__.py b/apis/python/src/tiledb/vector_search/object_readers/__init__.py
@@ -1,5 +1,4 @@
-from .bioimage_reader import BioImagePartition
-from .bioimage_reader import BioImageReader
+from .bioimage_reader import BioImageDirectoryReader
 from .directory_reader import DirectoryImageReader
 from .directory_reader import DirectoryPartition
 from .directory_reader import DirectoryReader
@@ -18,8 +17,7 @@
     "SomaAnnDataReader",
     "TileDB1DArrayPartition",
     "TileDB1DArrayReader",
-    "BioImagePartition",
-    "BioImageReader",
+    "BioImageDirectoryReader",
     "DirectoryReader",
     "DirectoryTextReader",
     "DirectoryImageReader",
diff --git a/apis/python/src/tiledb/vector_search/object_readers/bioimage_reader.py b/apis/python/src/tiledb/vector_search/object_readers/bioimage_reader.py
@@ -1,65 +1,52 @@
-from typing import Any, Dict, List, Mapping, Optional, OrderedDict, Tuple
+from typing import Dict, List, Optional, OrderedDict, Sequence, Tuple
 
 import numpy as np
 
 import tiledb
-
-# from tiledb.vector_search.object_readers import ObjectPartition, ObjectReader
+from tiledb.vector_search.object_readers.directory_reader import DirectoryImageReader
+from tiledb.vector_search.object_readers.directory_reader import DirectoryPartition
 
 MAX_IMAGE_CROPS_PER_IMAGE = 10000
 
 
-# class BioImagePartition(ObjectPartition):
-class BioImagePartition:
+class BioImageDirectoryReader(DirectoryImageReader):
     def __init__(
         self,
-        partition_id: int,
-        image_uris: List[str],
-        image_id_start: int,
-    ):
-        self.partition_id = partition_id
-        self.image_uris = image_uris
-        self.image_id_start = image_id_start
-
-    def init_kwargs(self) -> Dict:
-        return {
-            "partition_id": self.partition_id,
-            "image_uris": self.image_uris,
-            "image_id_start": self.image_id_start,
-        }
-
-    def id(self) -> int:
-        return self.partition_id
-
-
-# class BioImageReader(ObjectReader):
-class BioImageReader:
-    def __init__(
-        self,
-        uri: str,
+        search_uri: str,
+        include: str = "*",
+        exclude: Sequence[str] = ["[.]*", "*/[.]*"],
+        suffixes: Optional[Sequence[str]] = None,
+        max_files: Optional[int] = None,
         level: int = -1,
         object_crop_shape: Tuple[int, int] = None,
-        config: Optional[Mapping[str, Any]] = None,
         timestamp=None,
     ):
-        self.uri = uri
+        super().__init__(
+            search_uri=search_uri,
+            include=include,
+            exclude=exclude,
+            suffixes=suffixes,
+            max_files=max_files,
+        )
         self.level = level
         self.object_crop_shape = object_crop_shape
-        self.config = config
         self.timestamp = timestamp
         self.images = None
 
     def init_kwargs(self) -> Dict:
         return {
-            "uri": self.uri,
+            "search_uri": self.search_uri,
+            "include": self.include,
+            "exclude": self.exclude,
+            "suffixes": self.suffixes,
+            "max_files": self.max_files,
             "level": self.level,
             "object_crop_shape": self.object_crop_shape,
-            "config": self.config,
             "timestamp": self.timestamp,
         }
 
     def partition_class_name(self) -> str:
-        return "BioImagePartition"
+        return "DirectoryPartition"
 
     def metadata_array_uri(self) -> str:
         return None
@@ -76,157 +63,61 @@ def metadata_attributes(self) -> List[tiledb.Attr]:
         )
         return [image_uri_attr, location_attr]
 
-    def get_partitions(
-        self, images_per_partitions: int = -1, **kwargs
-    ) -> List[BioImagePartition]:
-        if images_per_partitions == -1:
-            images_per_partitions = 1
-        if self.images is None:
-            vfs = tiledb.VFS(config=self.config)
-            self.images = vfs.ls(self.uri)[1:]
-        num_images = len(self.images)
-        partitions = []
-        partition_id = 0
-        for start in range(0, num_images, images_per_partitions):
-            end = start + images_per_partitions
-            if end > num_images:
-                end = num_images
-            partitions.append(
-                BioImagePartition(
-                    partition_id,
-                    image_uris=self.images[start:end],
-                    image_id_start=start,
-                )
-            )
-            partition_id += 1
-        return partitions
-
     def read_objects(
-        self, partition: BioImagePartition
+        self, partition: DirectoryPartition
     ) -> Tuple[OrderedDict, OrderedDict]:
         from tiledb.bioimg.openslide import TileDBOpenSlide
 
-        def compute_external_id() -> int:
-            id = image_id * MAX_IMAGE_CROPS_PER_IMAGE + image_iter_id
-            return id
-
-        def crop_image(dim_0_start, dim_0_end, dim_1_start, dim_1_end):
+        def crop_image(path, dim_0_start, dim_0_end, dim_1_start, dim_1_end):
             cropped_image = image[dim_0_start:dim_0_end, dim_1_start:dim_1_end]
             images[write_id] = cropped_image.flatten()
             shapes[write_id] = np.array(cropped_image.shape, dtype=np.uint32)
-            image_uris[write_id] = image_uri
+            image_uris[write_id] = path
             locations[write_id] = np.array(
                 [dim_0_start, dim_0_end, dim_1_start, dim_1_end], dtype=np.uint32
             )
-            external_ids[write_id] = compute_external_id()
-
-        with tiledb.scope_ctx(ctx_or_config=self.config):
-            max_size = MAX_IMAGE_CROPS_PER_IMAGE * len(partition.image_uris)
-            images = np.empty(max_size, dtype="O")
-            shapes = np.empty(max_size, dtype="O")
-            external_ids = np.zeros(max_size, dtype=np.uint64)
-            image_uris = np.empty(max_size, dtype="O")
-            locations = np.empty(max_size, dtype="O")
-            write_id = 0
-            image_id = partition.image_id_start
-            for image_uri in partition.image_uris:
-                image_iter_id = 0
-                slide = TileDBOpenSlide(image_uri)
-                level_dimensions = slide.level_dimensions[self.level]
-                image = slide.read_region((0, 0), self.level, level_dimensions)
-                if self.object_crop_shape is None:
-                    crop_image(0, level_dimensions[1], 0, level_dimensions[0])
-                    write_id += 1
-                else:
-                    for dim_0_start in range(
-                        0, level_dimensions[1], self.object_crop_shape[0]
-                    ):
-                        for dim_1_start in range(
-                            0, level_dimensions[0], self.object_crop_shape[1]
-                        ):
-                            dim_0_end = min(
-                                dim_0_start + self.object_crop_shape[0],
-                                level_dimensions[1],
-                            )
-                            dim_1_end = min(
-                                dim_1_start + self.object_crop_shape[1],
-                                level_dimensions[0],
-                            )
-                            crop_image(dim_0_start, dim_0_end, dim_1_start, dim_1_end)
-                            write_id += 1
-                            image_iter_id += 1
-                image_id += 1
-            return (
-                {
-                    "image": images[0:write_id],
-                    "shape": shapes[0:write_id],
-                    "external_id": external_ids[0:write_id],
-                },
-                {
-                    "image_uri": image_uris[0:write_id],
-                    "location": locations[0:write_id],
-                    "external_id": external_ids[0:write_id],
-                },
-            )
-
-    def read_objects_by_external_ids(self, ids: List[int]) -> OrderedDict:
-        from tiledb.bioimg.openslide import TileDBOpenSlide
-
-        def crop_image():
-            i = 0
+            external_ids[write_id] = abs(hash(f"{path}_{dim_0_start}_{dim_1_start}"))
+
+        max_size = MAX_IMAGE_CROPS_PER_IMAGE * len(partition.paths)
+        images = np.empty(max_size, dtype="O")
+        shapes = np.empty(max_size, dtype="O")
+        external_ids = np.zeros(max_size, dtype=np.uint64)
+        image_uris = np.empty(max_size, dtype="O")
+        locations = np.empty(max_size, dtype="O")
+        write_id = 0
+        for path in partition.paths:
+            slide = TileDBOpenSlide(path)
+            level_dimensions = slide.level_dimensions[self.level]
+            image = slide.read_region((0, 0), self.level, level_dimensions)
             if self.object_crop_shape is None:
-                if image_iter_id == i:
-                    images[write_id] = image.flatten()
-                    shapes[write_id] = np.array(image.shape, dtype=np.uint32)
-                    external_ids[write_id] = external_id
-                    return
+                crop_image(path, 0, level_dimensions[1], 0, level_dimensions[0])
+                write_id += 1
             else:
                 for dim_0_start in range(
                     0, level_dimensions[1], self.object_crop_shape[0]
                 ):
                     for dim_1_start in range(
                         0, level_dimensions[0], self.object_crop_shape[1]
                     ):
-                        if image_iter_id == i:
-                            dim_0_end = min(
-                                dim_0_start + self.object_crop_shape[0],
-                                level_dimensions[1],
-                            )
-                            dim_1_end = min(
-                                dim_1_start + self.object_crop_shape[1],
-                                level_dimensions[0],
-                            )
-                            cropped_image = image[
-                                dim_0_start:dim_0_end, dim_1_start:dim_1_end
-                            ]
-                            images[write_id] = cropped_image.flatten()
-                            shapes[write_id] = np.array(
-                                cropped_image.shape, dtype=np.uint32
-                            )
-                            external_ids[write_id] = external_id
-                            return
-                        i += 1
-
-        with tiledb.scope_ctx(ctx_or_config=self.config):
-            size = len(ids)
-            images = np.empty(size, dtype="O")
-            shapes = np.empty(size, dtype="O")
-            external_ids = np.zeros(size, dtype=np.uint64)
-            if self.images is None:
-                vfs = tiledb.VFS(config=self.config)
-                self.images = vfs.ls(self.uri)[1:]
-
-            image_id = -1
-            write_id = 0
-            for external_id in ids:
-                new_image_id = external_id // MAX_IMAGE_CROPS_PER_IMAGE
-                image_iter_id = external_id % MAX_IMAGE_CROPS_PER_IMAGE
-                if new_image_id != image_id:
-                    # Load image
-                    image_id = new_image_id
-                    slide = TileDBOpenSlide(self.images[image_id])
-                    level_dimensions = slide.level_dimensions[self.level]
-                    image = slide.read_region((0, 0), self.level, level_dimensions)
-                crop_image()
-                write_id += 1
-            return {"image": images, "shape": shapes, "external_id": external_ids}
+                        dim_0_end = min(
+                            dim_0_start + self.object_crop_shape[0],
+                            level_dimensions[1],
+                        )
+                        dim_1_end = min(
+                            dim_1_start + self.object_crop_shape[1],
+                            level_dimensions[0],
+                        )
+                        crop_image(path, dim_0_start, dim_0_end, dim_1_start, dim_1_end)
+                        write_id += 1
+        return (
+            {
+                "image": images[0:write_id],
+                "shape": shapes[0:write_id],
+                "external_id": external_ids[0:write_id],
+            },
+            {
+                "image_uri": image_uris[0:write_id],
+                "location": locations[0:write_id],
+                "external_id": external_ids[0:write_id],
+            },
+        )
diff --git a/apis/python/src/tiledb/vector_search/object_readers/directory_reader.py b/apis/python/src/tiledb/vector_search/object_readers/directory_reader.py