Commit 88eee31

Merge pull request #10 from Imageomics/embed

Add embed command

2 parents 71d8a63 + 12f555f

5 files changed: 195 additions, 53 deletions

README.md

Lines changed: 35 additions & 10 deletions

@@ -103,21 +103,24 @@ bear 1.0
 
 ## Command Line Usage
 ```
-bioclip predict [options] [IMAGE_FILE...]
+bioclip predict [-h] [--format {table,csv}] [--output OUTPUT] [--rank {kingdom,phylum,class,order,family,genus,species}] [--k K] [--cls CLS] [--device DEVICE] image_file [image_file ...]
+bioclip embed [-h] [--device=DEVICE] [--output=OUTPUT] [IMAGE_FILE...]
+
+Commands:
+  predict  Use BioCLIP to generate predictions for image files.
+  embed    Use BioCLIP to generate embeddings for image files.
 
 Arguments:
-  IMAGE_FILE  input image file
+  IMAGE_FILE  input image file
 
 Options:
   -h --help
-  --format=FORMAT  format of the output (table or csv) [default: csv]
-  --rank=RANK  rank of the classification (kingdom, phylum, class, order, family, genus, species)
-               [default: species]
-  --k=K  number of top predictions to show [default: 5]
-  --cls=CLS  comma separated list of classes to predict, when specified the --rank and
-             --k arguments are ignored [default: all]
-  --device=DEVICE  device to use for prediction (cpu or cuda or mps) [default: cpu]
-  --output=OUTFILE  print output to file OUTFILE [default: stdout]
+  --format=FORMAT  format of the output (table or csv) for predict mode [default: csv]
+  --rank=RANK  rank of the classification (kingdom, phylum, class, order, family, genus, species) [default: species]
+  --k=K  number of top predictions to show [default: 5]
+  --cls=CLS  comma separated list of classes to predict, when specified the --rank and --k arguments are not allowed
+  --device=DEVICE  device to use for matrix math (cpu or cuda or mps) [default: cpu]
+  --output=OUTFILE  print output to file OUTFILE [default: stdout]
 ```
 
 ### Predict classification
@@ -191,6 +194,28 @@ Ursus-arctos.jpeg,bird,3.051998476166773e-08
 Ursus-arctos.jpeg,bear,0.9999998807907104
 ```
 
+### Create embeddings
+
+#### Create embedding for an image
+
+```console
+bioclip embed Ursus-arctos.jpeg
+```
+Output:
+```
+{
+    "model": "hf-hub:imageomics/bioclip",
+    "embeddings": {
+        "Ursus-arctos.jpeg": [
+            -0.23633578419685364,
+            -0.28467196226119995,
+            -0.4394485652446747,
+            ...
+        ]
+    }
+}
+```
+
 ### View command line help
 ```console
 bioclip --help
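
The JSON written by `bioclip embed` is plain data and easy to post-process. Below is a minimal sketch of reading it back, assuming it was produced with `bioclip embed --output embeddings.json ...` and that at least two images were embedded; the cosine-similarity comparison is illustrative, not part of this commit.

```python
import json
import math

# Load the file written by: bioclip embed --output embeddings.json IMAGE_FILE...
with open("embeddings.json") as fh:
    data = json.load(fh)

print(data["model"])  # e.g. "hf-hub:imageomics/bioclip"

# Each entry maps an image path to its embedding vector (a list of floats).
vectors = data["embeddings"]

# Illustrative use: cosine similarity between the first two embedded images.
(path_a, a), (path_b, b) = list(vectors.items())[:2]
dot = sum(x * y for x, y in zip(a, b))
norm_a = math.sqrt(sum(x * x for x in a))
norm_b = math.sqrt(sum(x * x for x in b))
print(f"{path_a} vs {path_b}: {dot / (norm_a * norm_b):.4f}")
```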

src/bioclip/__main__.py

Lines changed: 76 additions & 38 deletions

@@ -1,27 +1,9 @@
-"""Usage: bioclip predict [options] [IMAGE_FILE...]
-
-Use BioCLIP to generate predictions for an IMAGE_FILE.
-
-Arguments:
-  IMAGE_FILE  input image file
-
-Options:
-  -h --help
-  --format=FORMAT  format of the output (table or csv) [default: csv]
-  --rank=RANK  rank of the classification (kingdom, phylum, class, order, family, genus, species) [default: species]
-  --k=K  number of top predictions to show [default: 5]
-  --cls=CLS  comma separated list of classes to predict, when specified the --rank and --k arguments are ignored [default: all]
-  --device=DEVICE  device to use for prediction (cpu or cuda or mps) [default: cpu]
-  --output=OUTFILE  print output to file OUTFILE [default: stdout]
-
-"""
-from docopt import docopt
 from bioclip import TreeOfLifeClassifier, Rank, CustomLabelsClassifier
 import json
 import sys
 import prettytable as pt
-import csv
 import pandas as pd
+import argparse
 
 
 def write_results(data, format, output):
@@ -46,33 +28,89 @@ def write_results_to_file(df, format, outfile):
     else:
         raise ValueError(f"Invalid format: {format}")
 
-
-def main():
-    # execute only if run as the entry point into the program
-    x = docopt(__doc__) # parse arguments based on docstring above
-    format = x['--format']
-    output = x['--output']
-    image_file = x['IMAGE_FILE']
-    device = 'cpu'
-    if x['--device']:
-        device = x['--device']
-    cls = x['--cls']
-    if not format in ['table', 'csv']:
-        raise ValueError(f"Invalid format: {format}")
-    rank = Rank[x['--rank'].upper()]
-    if cls == 'all':
-        classifier = TreeOfLifeClassifier(device=device)
+def predict(image_file: list[str], format: str, output: str,
+            cls_str: str, device: str, rank: Rank, k: int):
+    if cls_str:
+        classifier = CustomLabelsClassifier(device=device)
         data = []
         for image_path in image_file:
-            data.extend(classifier.predict(image_path=image_path, rank=rank, k=int(x['--k'])))
+            data.extend(classifier.predict(image_path=image_path, cls_ary=cls_str.split(',')))
         write_results(data, format, output)
     else:
-        classifier = CustomLabelsClassifier(device=device)
+        classifier = TreeOfLifeClassifier(device=device)
         data = []
         for image_path in image_file:
-            data.extend(classifier.predict(image_path=image_path, cls_ary=cls.split(',')))
+            data.extend(classifier.predict(image_path=image_path, rank=rank, k=k))
         write_results(data, format, output)
 
 
+def embed(image_file: list[str], output: str, device: str):
+    classifier = TreeOfLifeClassifier(device=device)
+    images_dict = {}
+    data = {
+        "model": classifier.model_str,
+        "embeddings": images_dict
+    }
+    for image_path in image_file:
+        features = classifier.get_image_features(image_path)[0]
+        images_dict[image_path] = features.tolist()
+    if output == 'stdout':
+        print(json.dumps(data, indent=4))
+    else:
+        with open(output, 'w') as outfile:
+            json.dump(data, outfile, indent=4)
+
+
+def create_parser():
+    parser = argparse.ArgumentParser(prog='bioclip', description='BioCLIP command line interface')
+    subparsers = parser.add_subparsers(title='commands', dest='command')
+
+    # Predict command
+    predict_parser = subparsers.add_parser('predict', help='Use BioCLIP to generate predictions for image files.')
+    predict_parser.add_argument('image_file', nargs='+', help='input image file(s)')
+    predict_parser.add_argument('--format', choices=['table', 'csv'], default='csv', help='format of the output, default: csv')
+    predict_parser.add_argument('--output', default='stdout', help='print output to file, default: stdout')
+    predict_parser.add_argument('--rank', choices=['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'],
+                                help='rank of the classification, default: species (when)')
+    predict_parser.add_argument('--k', type=int, help='number of top predictions to show, default: 5')
+    predict_parser.add_argument('--cls', help='comma separated list of classes to predict, when specified the --rank and --k arguments are not allowed')
+    predict_parser.add_argument('--device', help='device to use (cpu or cuda or mps), default: cpu', default='cpu')
+
+    # Embed command
+    embed_parser = subparsers.add_parser('embed', help='Use BioCLIP to generate embeddings for image files.')
+    embed_parser.add_argument('image_file', nargs='+', help='input image file(s)')
+    embed_parser.add_argument('--output', default='stdout', help='print output to file, default: stdout')
+    embed_parser.add_argument('--device', help='device to use (cpu or cuda or mps), default: cpu', default='cpu')
+
+    return parser
+
+
+def parse_args(input_args=None):
+    args = create_parser().parse_args(input_args)
+    if args.command == 'predict':
+        if args.cls:
+            # custom class list mode
+            if args.rank or args.k:
+                raise ValueError("Cannot use --cls with --rank or --k")
+        else:
+            # tree of life class list mode
+            if not args.rank:
+                args.rank = 'species'
+            args.rank = Rank[args.rank.upper()]
+            if not args.k:
+                args.k = 5
+    return args
+
+
+def main():
+    args = parse_args()
+    if args.command == 'embed':
+        embed(args.image_file, args.output, args.device)
+    elif args.command == 'predict':
+        predict(args.image_file, args.format, args.output, args.cls, args.device, args.rank, args.k)
+    else:
+        raise ValueError("Invalid command")
+
+
 if __name__ == '__main__':
     main()
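
For readers following the refactor from docopt to argparse: the new `parse_args` and `embed` functions can also be driven directly from Python, mirroring what `main()` does. A small sketch follows; the image file name and output path are placeholders, and calling `embed` will load the BioCLIP model.

```python
from bioclip.__main__ import parse_args, embed

# Parse argv exactly as the console script would.
args = parse_args(['embed', '--output', 'embeddings.json', 'Ursus-arctos.jpeg'])
print(args.command, args.image_file, args.device)  # embed ['Ursus-arctos.jpeg'] cpu

# Dispatch the embed command the same way main() does.
embed(args.image_file, args.output, args.device)
```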

src/bioclip/predict.py

Lines changed: 13 additions & 5 deletions

@@ -14,6 +14,7 @@
 
 HF_DATAFILE_REPO = "imageomics/bioclip-demo"
 HF_DATAFILE_REPO_TYPE = "space"
+MODEL_STR = "hf-hub:imageomics/bioclip"
 PRED_FILENAME_KEY = "file_name"
 PRED_CLASSICATION_KEY = "classification"
 PRED_SCORE_KEY = "score"
@@ -149,7 +150,7 @@ def get_label(self):
 COMMON_NAME_LABEL = "common_name"
 
 
-def create_bioclip_model(model_str="hf-hub:imageomics/bioclip", device="cuda"):
+def create_bioclip_model(model_str, device="cuda"):
     model = create_model(model_str, output_dict=True, require_pretrained=True)
     model = model.to(device)
     return torch.compile(model)
@@ -160,9 +161,10 @@ def create_bioclip_tokenizer(tokenizer_str="ViT-B-16"):
 
 
 class CustomLabelsClassifier(object):
-    def __init__(self, device: Union[str, torch.device] = 'cpu'):
+    def __init__(self, device: Union[str, torch.device] = 'cpu', model_str: str = MODEL_STR):
         self.device = device
-        self.model = create_bioclip_model(device=device)
+        self.model = create_bioclip_model(device=device, model_str=model_str)
+        self.model_str = model_str
         self.tokenizer = create_bioclip_tokenizer()
 
     def get_txt_features(self, classnames):
@@ -237,12 +239,18 @@ def join_names(classification_dict: dict[str, str]) -> str:
 
 
 class TreeOfLifeClassifier(object):
-    def __init__(self, device: Union[str, torch.device] = 'cpu'):
+    def __init__(self, device: Union[str, torch.device] = 'cpu', model_str: str = MODEL_STR):
         self.device = device
-        self.model = create_bioclip_model(device=device)
+        self.model = create_bioclip_model(device=device, model_str=model_str)
+        self.model_str = model_str
         self.txt_emb = get_txt_emb().to(device)
         self.txt_names = get_txt_names()
 
+    @torch.no_grad()
+    def get_image_features(self, image_path: str) -> torch.Tensor:
+        img = PIL.Image.open(image_path)
+        return self.encode_image(img)
+
     def encode_image(self, img: PIL.Image.Image) -> torch.Tensor:
         img = preprocess_img(img).to(self.device)
         img_features = self.model.encode_image(img.unsqueeze(0))
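
Note that `get_image_features` is exposed on the Python API as well, so the `embed` command is essentially a thin wrapper around it. A brief sketch; the image path is a placeholder, and the [1, 512] shape comes from the test below.

```python
from bioclip import TreeOfLifeClassifier

classifier = TreeOfLifeClassifier(device='cpu')
print(classifier.model_str)  # "hf-hub:imageomics/bioclip"

# Returns a torch.Tensor of image features with shape [1, 512].
features = classifier.get_image_features('Ursus-arctos.jpeg')

# The embed command stores features[0].tolist() in its JSON output.
embedding = features[0].tolist()
print(len(embedding))  # 512
```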

tests/test_main.py

Lines changed: 61 additions & 0 deletions

@@ -0,0 +1,61 @@
+import unittest
+from bioclip.__main__ import parse_args, Rank
+
+
+class TestParser(unittest.TestCase):
+    def test_parse_args(self):
+
+        args = parse_args(['predict', 'image.jpg'])
+        self.assertEqual(args.command, 'predict')
+        self.assertEqual(args.image_file, ['image.jpg'])
+        self.assertEqual(args.format, 'csv')
+        self.assertEqual(args.output, 'stdout')
+        self.assertEqual(args.rank, Rank.SPECIES)
+        self.assertEqual(args.k, 5)
+        self.assertEqual(args.cls, None)
+        self.assertEqual(args.device, 'cpu')
+
+        args = parse_args(['predict', 'image.jpg', 'image2.png'])
+        self.assertEqual(args.command, 'predict')
+        self.assertEqual(args.image_file, ['image.jpg', 'image2.png'])
+
+        # test tree of life version of predict
+        args = parse_args(['predict', 'image.jpg', '--format', 'table', '--output', 'output.csv', '--rank', 'genus', '--k', '10', '--device', 'cuda'])
+        self.assertEqual(args.command, 'predict')
+        self.assertEqual(args.image_file, ['image.jpg'])
+        self.assertEqual(args.format, 'table')
+        self.assertEqual(args.output, 'output.csv')
+        self.assertEqual(args.rank, Rank.GENUS)
+        self.assertEqual(args.k, 10)
+        self.assertEqual(args.cls, None)
+        self.assertEqual(args.device, 'cuda')
+
+        # test custom class list version of predict
+        args = parse_args(['predict', 'image.jpg', '--format', 'table', '--output', 'output.csv', '--cls', 'class1,class2', '--device', 'cuda'])
+        self.assertEqual(args.command, 'predict')
+        self.assertEqual(args.image_file, ['image.jpg'])
+        self.assertEqual(args.format, 'table')
+        self.assertEqual(args.output, 'output.csv')
+        self.assertEqual(args.rank, None)  # default ignored for the --cls variation
+        self.assertEqual(args.k, None)
+        self.assertEqual(args.cls, 'class1,class2')
+        self.assertEqual(args.device, 'cuda')
+
+        # test error when using --cls with --rank
+        with self.assertRaises(ValueError):
+            parse_args(['predict', 'image.jpg', '--cls', 'class1,class2', '--rank', 'genus'])
+        # test error when using --cls with --k
+        with self.assertRaises(ValueError):
+            parse_args(['predict', 'image.jpg', '--cls', 'class1,class2', '--k', '10'])
+
+        args = parse_args(['embed', 'image.jpg'])
+        self.assertEqual(args.command, 'embed')
+        self.assertEqual(args.image_file, ['image.jpg'])
+        self.assertEqual(args.output, 'stdout')
+        self.assertEqual(args.device, 'cpu')
+
+        args = parse_args(['embed', '--output', 'data.json', '--device', 'cuda', 'image.jpg', 'image2.png'])
+        self.assertEqual(args.command, 'embed')
+        self.assertEqual(args.image_file, ['image.jpg', 'image2.png'])
+        self.assertEqual(args.output, 'data.json')
+        self.assertEqual(args.device, 'cuda')

tests/test_predict.py

Lines changed: 10 additions & 0 deletions

@@ -2,6 +2,8 @@
 from bioclip.predict import TreeOfLifeClassifier, Rank
 from bioclip.predict import CustomLabelsClassifier
 import os
+import torch
+
 
 DIRNAME = os.path.dirname(os.path.realpath(__file__))
 EXAMPLE_CAT_IMAGE = os.path.join(DIRNAME, "images", "mycat.jpg")
@@ -48,3 +50,11 @@ def test_custom_labels_classifier(self):
             {'file_name': EXAMPLE_CAT_IMAGE, 'classification': 'cat', 'score': unittest.mock.ANY},
             {'file_name': EXAMPLE_CAT_IMAGE, 'classification': 'dog', 'score': unittest.mock.ANY},
         ])
+
+
+class TestEmbed(unittest.TestCase):
+    def test_get_image_features(self):
+        classifier = TreeOfLifeClassifier(device='cpu')
+        self.assertEqual(classifier.model_str, 'hf-hub:imageomics/bioclip')
+        features = classifier.get_image_features(EXAMPLE_CAT_IMAGE)
+        self.assertEqual(features.shape, torch.Size([1, 512]))
