Adds ability to provide other OpenCLIP models and checkpoints (#33)

hlapp · web-flow · commit c8d205109138 · 2024-08-15T14:43:42.000-04:00
For now this only works when predicting on custom classes. Supporting
the full tree-of-life would require the embeddings for all text labels
to be pre-computed and made available for download and caching.

Also updates the documentation for command-line arguments.
diff --git a/src/bioclip/__main__.py b/src/bioclip/__main__.py
@@ -1,4 +1,6 @@
 from bioclip import TreeOfLifeClassifier, Rank, CustomLabelsClassifier
+from .predict import BIOCLIP_MODEL_STR
+import open_clip as oc
 import json
 import sys
 import prettytable as pt
@@ -29,20 +31,25 @@ def write_results_to_file(df, format, outfile):
         raise ValueError(f"Invalid format: {format}")
 
 
-def predict(image_file: list[str], format: str,  output: str,
-             cls_str: str, device: str,  rank: Rank, k: int):
+def predict(image_file: list[str],
+            format: str,
+            output: str,
+            cls_str: str,
+            rank: Rank,
+            k: int,
+            **kwargs):
     if cls_str:
-        classifier = CustomLabelsClassifier(cls_ary=cls_str.split(','), device=device)
+        classifier = CustomLabelsClassifier(cls_ary=cls_str.split(','), **kwargs)
         predictions = classifier.predict(image_paths=image_file, k=k)
         write_results(predictions, format, output)
     else:
-        classifier = TreeOfLifeClassifier(device=device)
+        classifier = TreeOfLifeClassifier(**kwargs)
         predictions = classifier.predict(image_paths=image_file, rank=rank, k=k)
         write_results(predictions, format, output)
 
 
-def embed(image_file: list[str], output: str, device: str):
-    classifier = TreeOfLifeClassifier(device=device)
+def embed(image_file: list[str], output: str, **kwargs):
+    classifier = TreeOfLifeClassifier(**kwargs)
     images_dict = {}
     data = {
         "model": classifier.model_str,
@@ -62,22 +69,42 @@ def create_parser():
     parser = argparse.ArgumentParser(prog='bioclip', description='BioCLIP command line interface')
     subparsers = parser.add_subparsers(title='commands', dest='command')
 
+    device_arg = {'default':'cpu', 'help': 'device to use (cpu or cuda or mps), default: cpu'}
+    output_arg = {'default': 'stdout', 'help': 'print output to file, default: stdout'}
+    model_arg = {'help': f'model identifier (see command list-models); default: {BIOCLIP_MODEL_STR}'}
+    pretrained_arg = {'help': 'pretrained model checkpoint as tag or file, depends on model; '
+                              'needed only if more than one is available (see command list-models)'}
+
     # Predict command
     predict_parser = subparsers.add_parser('predict', help='Use BioCLIP to generate predictions for image files.')
     predict_parser.add_argument('image_file', nargs='+', help='input image file(s)')
     predict_parser.add_argument('--format', choices=['table', 'csv'], default='csv', help='format of the output, default: csv')
-    predict_parser.add_argument('--output', default='stdout', help='print output to file, default: stdout')
+    predict_parser.add_argument('--output', **output_arg)
     predict_parser.add_argument('--rank', choices=['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'],
                                 help='rank of the classification, default: species (when)')
     predict_parser.add_argument('--k', type=int, help='number of top predictions to show, default: 5')
-    predict_parser.add_argument('--cls', help='comma separated list of classes to predict, when specified the --rank and --k arguments are not allowed')
-    predict_parser.add_argument('--device', help='device to use (cpu or cuda or mps), default: cpu', default='cpu')
+    predict_parser.add_argument('--cls', help='comma separated list of classes to predict, when specified the --rank argument is not allowed')
+    predict_parser.add_argument('--device', **device_arg)
+    predict_parser.add_argument('--model', **model_arg)
+    predict_parser.add_argument('--pretrained', **pretrained_arg)
 
     # Embed command
     embed_parser = subparsers.add_parser('embed', help='Use BioCLIP to generate embeddings for image files.')
     embed_parser.add_argument('image_file', nargs='+', help='input image file(s)')
-    embed_parser.add_argument('--output', default='stdout', help='print output to file, default: stdout')
-    embed_parser.add_argument('--device', help='device to use (cpu or cuda or mps), default: cpu', default='cpu')
+    embed_parser.add_argument('--output', **output_arg)
+    embed_parser.add_argument('--device', **device_arg)
+    embed_parser.add_argument('--model', **model_arg)
+    embed_parser.add_argument('--pretrained', **pretrained_arg)
+
+    # List command
+    list_parser = subparsers.add_parser('list-models',
+                                        help='List available models and pretrained model checkpoints.',
+                                        description=
+                                             'Note that this will only list models known to open_clip; '
+                                             'any model identifier loadable by open_clip, such as from hf-hub, file, etc '
+                                             'should also be usable for --model in the embed and predict commands. '
+                                             f'(The default model {BIOCLIP_MODEL_STR} is one example.)')
+    list_parser.add_argument('--model', help='list available tags for pretrained model checkpoint(s) for specified model')
 
     return parser
 
@@ -91,6 +118,8 @@ def parse_args(input_args=None):
                 raise ValueError("Cannot use --cls with --rank")
         else:
             # tree of life class list mode
+            if args.model or args.pretrained:
+                raise ValueError("Custom model or checkpoints currently not supported for Tree-of-Life prediction")
             if not args.rank:
                 args.rank = 'species'
             args.rank = Rank[args.rank.upper()]
@@ -102,9 +131,28 @@ def parse_args(input_args=None):
 def main():
     args = parse_args()
     if args.command == 'embed':
-        embed(args.image_file, args.output, args.device)
+        embed(args.image_file,
+              args.output,
+              device=args.device,
+              model_str=args.model,
+              pretrained_str=args.pretrained)
     elif args.command == 'predict':
-        predict(args.image_file, args.format, args.output, args.cls, args.device, args.rank, args.k)
+        predict(args.image_file,
+                format=args.format,
+                output=args.output,
+                cls_str=args.cls,
+                rank=args.rank,
+                k=args.k,
+                device=args.device,
+                model_str=args.model,
+                pretrained_str=args.pretrained)
+    elif args.command == 'list-models':
+        if args.model:
+            for tag in oc.list_pretrained_tags_by_model(args.model):
+                print(tag)
+        else:
+            for model_str in oc.list_models():
+                print(f"\t{model_str}")
     else:
         raise ValueError("Invalid command")
 
diff --git a/src/bioclip/predict.py b/src/bioclip/predict.py
@@ -1,7 +1,7 @@
 import json
 import torch
 from torchvision import transforms
-from open_clip import create_model, get_tokenizer
+import open_clip as oc
 import torch.nn.functional as F
 import numpy as np
 import collections
@@ -14,7 +14,7 @@
 
 HF_DATAFILE_REPO = "imageomics/bioclip-demo"
 HF_DATAFILE_REPO_TYPE = "space"
-MODEL_STR = "hf-hub:imageomics/bioclip"
+BIOCLIP_MODEL_STR = "hf-hub:imageomics/bioclip"
 PRED_FILENAME_KEY = "file_name"
 PRED_CLASSICATION_KEY = "classification"
 PRED_SCORE_KEY = "score"
@@ -139,14 +139,8 @@ def get_label(self):
 COMMON_NAME_LABEL = "common_name"
 
 
-def create_bioclip_model(model_str, device="cuda"):
-    model = create_model(model_str, output_dict=True, require_pretrained=True)
-    model = model.to(device)
-    return torch.compile(model)
-
-
-def create_bioclip_tokenizer(tokenizer_str="ViT-B-16"):
-    return get_tokenizer(tokenizer_str)
+def create_bioclip_tokenizer(model_name="ViT-B-16"):
+    return oc.get_tokenizer(model_name=model_name)
 
 
 preprocess_img = transforms.Compose(
@@ -162,10 +156,23 @@ def create_bioclip_tokenizer(tokenizer_str="ViT-B-16"):
 
 
 class BaseClassifier(object):
-    def __init__(self, device: Union[str, torch.device] = 'cpu', model_str: str = MODEL_STR):
+    def __init__(self, model_str: str = BIOCLIP_MODEL_STR, pretrained_str: str | None = None, device: Union[str, torch.device] = 'cpu'):
         self.device = device
-        self.model = create_bioclip_model(device=device, model_str=model_str)
-        self.model_str = model_str
+        self.load_pretrained_model(model_str=model_str, pretrained_str=pretrained_str)
+
+    def load_pretrained_model(self, model_str: str = BIOCLIP_MODEL_STR, pretrained_str: str | None = None):
+        self.model_str = model_str or BIOCLIP_MODEL_STR
+        pretrained_tags = oc.list_pretrained_tags_by_model(self.model_str)
+        if pretrained_str is None and len(pretrained_tags) > 0:
+            if len(pretrained_tags) > 1:
+                raise ValueError(f"Multiple pretrained tags available {pretrained_tags}, must provide one")
+            pretrained_str = pretrained_tags[0]
+        model, preprocess = oc.create_model_from_pretrained(self.model_str,
+                                                            pretrained=pretrained_str,
+                                                            device=self.device,
+                                                            return_transform=True)
+        self.model = torch.compile(model.to(self.device))
+        self.preprocess = preprocess_img if self.model_str == BIOCLIP_MODEL_STR else preprocess
 
     @staticmethod
     def open_image(image_path):
@@ -176,7 +183,7 @@ def open_image(image_path):
     def create_image_features(self, images: List[PIL.Image.Image], normalize : bool = True) -> torch.Tensor:
         preprocessed_images = []
         for img in images:
-            prep_img = preprocess_img(img).to(self.device)
+            prep_img = self.preprocess(img).to(self.device)
             preprocessed_images.append(prep_img)
         preprocessed_image_tensor = torch.stack(preprocessed_images)
         img_features = self.model.encode_image(preprocessed_image_tensor)
@@ -209,9 +216,9 @@ def create_probabilities_for_image_paths(self, image_paths: List[str] | str,
 
 
 class CustomLabelsClassifier(BaseClassifier):
-    def __init__(self, cls_ary: List[str], device: Union[str, torch.device] = 'cpu', model_str: str = MODEL_STR):
-        super().__init__(device=device, model_str=model_str)
-        self.tokenizer = create_bioclip_tokenizer()
+    def __init__(self, cls_ary: List[str], **kwargs):
+        super().__init__(**kwargs)
+        self.tokenizer = create_bioclip_tokenizer(self.model_str)
         self.classes = [cls.strip() for cls in cls_ary]
         self.txt_features = self._get_txt_features(self.classes)
 
@@ -286,9 +293,9 @@ def join_names(classification_dict: dict[str, str]) -> str:
 
 
 class TreeOfLifeClassifier(BaseClassifier):
-    def __init__(self, device: Union[str, torch.device] = 'cpu', model_str: str = MODEL_STR):
-        super().__init__(device=device, model_str=model_str)
-        self.txt_features = get_txt_emb().to(device)
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.txt_features = get_txt_emb().to(self.device)
         self.txt_names = get_txt_names()
 
     def format_species_probs(self, image_path: str, probs: torch.Tensor, k: int = 5) -> List[dict[str, float]]: