visual-layer
diff --git a/‎fastdup/__init__.py‎
Lines changed: 149 additions & 47 deletions b/‎fastdup/__init__.py‎
Lines changed: 149 additions & 47 deletions
diff --git a/‎fastdup/captions.py‎
Lines changed: 181 additions & 0 deletions b/‎fastdup/captions.py‎
Lines changed: 181 additions & 0 deletions
diff --git a/‎fastdup/definitions.py‎
Lines changed: 91 additions & 1 deletion b/‎fastdup/definitions.py‎
Lines changed: 91 additions & 1 deletion
diff --git a/‎fastdup/engine.py‎
Lines changed: 4 additions & 3 deletions b/‎fastdup/engine.py‎
Lines changed: 4 additions & 3 deletions
@@ -0,0 +1,181 @@
+
+from fastdup.sentry import fastdup_capture_exception
+from fastdup.definitions import MISSING_LABEL
+from fastdup.galleries import fastdup_imread
+from tqdm import tqdm
+import cv2
+
+def generate_labels(filenames, kwargs):
+    """Caption images with the ``nlpconnect/vit-gpt2-image-captioning`` model.
+
+    Args:
+        filenames: sized sequence of image paths; each is read with ``fastdup_imread``.
+        kwargs: forwarded to ``fastdup_imread`` as its ``kwargs`` argument.
+
+    Returns:
+        A list with one entry per input filename, in input order: the stripped
+        generated caption, or ``MISSING_LABEL`` for an image that could not be read.
+        If the dependencies cannot be imported, or captioning raises, every entry
+        is ``MISSING_LABEL``.
+    """
+    try:
+        from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
+        import torch
+    except Exception as e:
+        fastdup_capture_exception("Auto generate labels", e)
+        print("For auto captioning images need to install transformers and torch packages using `pip install transformers torch`")
+        return [MISSING_LABEL]*len(filenames)
+
+    try:
+        from PIL import Image
+        model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
+        feature_extractor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
+        tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
+
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        model.to(device)
+        gen_kwargs = {"max_length": 16, "num_beams": 4}
+
+        # Start with every slot marked missing and overwrite only the slots whose
+        # image was readable. Passing None entries to the feature extractor would
+        # raise and discard the captions of all the readable images as well.
+        labels = [MISSING_LABEL] * len(filenames)
+        images = []
+        valid_positions = []
+        for position, image_path in enumerate(tqdm(filenames)):
+            i_image = fastdup_imread(image_path, None, kwargs=kwargs)
+            if i_image is None:
+                continue
+            i_image = cv2.cvtColor(i_image, cv2.COLOR_BGR2RGB)
+            images.append(Image.fromarray(i_image))
+            valid_positions.append(position)
+
+        if not images:
+            # Nothing readable: skip the model call, which cannot take an empty batch.
+            return labels
+
+        pixel_values = feature_extractor(images=images, return_tensors="pt").pixel_values
+        pixel_values = pixel_values.to(device)
+        output_ids = model.generate(pixel_values, **gen_kwargs)
+
+        preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
+        for position, pred in zip(valid_positions, preds):
+            labels[position] = pred.strip()
+        return labels
+    except Exception as e:
+        fastdup_capture_exception("Auto caption image", e)
+        return [MISSING_LABEL]*len(filenames)
+
+def generate_blip_labels(filenames, kwargs):
+    """Caption images with the ``Salesforce/blip-image-captioning-large`` model.
+
+    Args:
+        filenames: sized sequence of image paths; each is read with ``fastdup_imread``.
+        kwargs: forwarded to ``fastdup_imread`` as its ``kwargs`` argument.
+
+    Returns:
+        One entry per filename, in order: the decoded caption, or ``MISSING_LABEL``
+        when the image could not be read. If the imports fail or anything raises
+        while captioning, every entry is ``MISSING_LABEL``.
+    """
+    try:
+        from transformers import BlipProcessor, BlipForConditionalGeneration
+        from PIL import Image
+    except Exception as import_error:
+        fastdup_capture_exception("Auto generate labels", import_error)
+        print("For auto captioning images need to install transforms and torch packages using `pip install transformers`")
+        return [MISSING_LABEL] * len(filenames)
+
+    checkpoint = "Salesforce/blip-image-captioning-large"
+    try:
+        processor = BlipProcessor.from_pretrained(checkpoint)
+        model = BlipForConditionalGeneration.from_pretrained(checkpoint)
+
+        def caption_one(image_path):
+            # Read one image and return its caption, or MISSING_LABEL if unreadable.
+            raw = fastdup_imread(image_path, None, kwargs=kwargs)
+            if raw is None:
+                return MISSING_LABEL
+            rgb = cv2.cvtColor(raw, cv2.COLOR_BGR2RGB)
+            model_inputs = processor(Image.fromarray(rgb), return_tensors="pt")
+            generated = model.generate(**model_inputs)
+            return processor.decode(generated[0], skip_special_tokens=True)
+
+        return [caption_one(image_path) for image_path in tqdm(filenames)]
+    except Exception as caption_error:
+        fastdup_capture_exception("Auto caption image blip", caption_error)
+        return [MISSING_LABEL] * len(filenames)
+
+def generate_blip2_labels(filenames, kwargs, text=None):
+    """Caption images with the ``Salesforce/blip2-opt-2.7b`` model.
+
+    Args:
+        filenames: sized sequence of image paths; each is read with ``fastdup_imread``.
+        kwargs: forwarded to ``fastdup_imread`` as its ``kwargs`` argument.
+        text: optional text passed to the processor together with each image.
+
+    Returns:
+        One entry per filename, in order: the stripped generated text, or
+        ``MISSING_LABEL`` when the image could not be read. If the imports fail or
+        anything raises while captioning, every entry is ``MISSING_LABEL``.
+    """
+    try:
+        # Blip2ForConditionalGeneration is the BLIP-2 class with a language-model
+        # head for text generation; Blip2Model only exposes feature extraction.
+        from transformers import Blip2Processor, Blip2ForConditionalGeneration
+        from PIL import Image
+        import torch
+    except Exception as e:
+        fastdup_capture_exception("Auto generate labels", e)
+        print("For auto captioning images need to install transformers and torch packages using `pip install transformers torch`")
+        return [MISSING_LABEL] * len(filenames)
+
+    try:
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        # Use half precision only on GPU: many CPU kernels have no float16
+        # implementation, so the CPU fallback must run in float32.
+        dtype = torch.float16 if device == "cuda" else torch.float32
+
+        processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
+        model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b", torch_dtype=dtype)
+        model.to(device)
+        preds = []
+        for image_path in tqdm(filenames):
+            i_image = fastdup_imread(image_path, None, kwargs=kwargs)
+            if i_image is None:
+                preds.append(MISSING_LABEL)
+                continue
+            i_image = cv2.cvtColor(i_image, cv2.COLOR_BGR2RGB)
+            im_pil = Image.fromarray(i_image)
+            # Inputs must live on the model's device and use the model's float dtype.
+            inputs = processor(images=im_pil, text=text, return_tensors="pt").to(device, dtype)
+            generated_ids = model.generate(**inputs)
+            generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
+            preds.append(generated_text)
+        return preds
+
+    except Exception as e:
+        fastdup_capture_exception("Auto caption image blip", e)
+        return [MISSING_LABEL]*len(filenames)
+
+
+
+
+
+
+def generate_vqa_labels(filenames, text, kwargs):
+    """Answer the question ``text`` about each image with ``dandelin/vilt-b32-finetuned-vqa``.
+
+    Args:
+        filenames: sized sequence of image paths; each is read with ``fastdup_imread``.
+        text: the text handed to the processor together with every image.
+        kwargs: forwarded to ``fastdup_imread`` as its ``kwargs`` argument.
+
+    Returns:
+        One entry per filename, in order: the ``id2label`` entry of the
+        highest-scoring logit, or ``MISSING_LABEL`` when the image could not be
+        read. If the imports fail or anything raises, every entry is ``MISSING_LABEL``.
+    """
+    try:
+        from transformers import ViltProcessor, ViltForQuestionAnswering
+        from PIL import Image
+    except Exception as import_error:
+        fastdup_capture_exception("Auto generate labels", import_error)
+        print(
+            "For auto captioning images need to install transforms and torch packages using `pip install transformers`")
+        return [MISSING_LABEL] * len(filenames)
+
+    checkpoint = "dandelin/vilt-b32-finetuned-vqa"
+    try:
+        answers = []
+        processor = ViltProcessor.from_pretrained(checkpoint)
+        model = ViltForQuestionAnswering.from_pretrained(checkpoint)
+        for image_path in tqdm(filenames):
+            raw = fastdup_imread(image_path, None, kwargs=kwargs)
+            if raw is None:
+                answers.append(MISSING_LABEL)
+                continue
+
+            rgb = cv2.cvtColor(raw, cv2.COLOR_BGR2RGB)
+            encoding = processor(Image.fromarray(rgb), text, return_tensors="pt")
+
+            # Forward pass, then pick the class with the largest logit.
+            best_index = model(**encoding).logits.argmax(-1).item()
+            answers.append(model.config.id2label[best_index])
+
+        return answers
+
+    except Exception as vqa_error:
+        fastdup_capture_exception("Auto caption image vqa", vqa_error)
+        return [MISSING_LABEL] * len(filenames)
+
+
+def generate_age_labels(filenames, kwargs):
+    """Classify each image with the ``nateraw/vit-age-classifier`` model.
+
+    Args:
+        filenames: sized sequence of image paths; each is read with ``fastdup_imread``.
+        kwargs: forwarded to ``fastdup_imread`` as its ``kwargs`` argument.
+
+    Returns:
+        One entry per filename, in order: the ``id2label`` entry of the
+        highest-scoring class, or ``MISSING_LABEL`` when the image could not be
+        read. If ``transformers`` cannot be imported, or loading/running the model
+        raises, every entry is ``MISSING_LABEL``.
+    """
+    try:
+        from transformers import ViTFeatureExtractor, ViTForImageClassification
+    except Exception as e:
+        # Same contract as the other label generators in this module: report the
+        # problem and degrade to missing labels instead of raising to the caller.
+        fastdup_capture_exception("Auto generate labels", e)
+        print("For age labels need to install the transformers package using `pip install transformers`")
+        return [MISSING_LABEL] * len(filenames)
+
+    try:
+        # Load the model and its preprocessing transforms once, inside the try so
+        # a download or load failure is reported rather than propagated.
+        model = ViTForImageClassification.from_pretrained('nateraw/vit-age-classifier')
+        transforms = ViTFeatureExtractor.from_pretrained('nateraw/vit-age-classifier')
+
+        preds = []
+        for image_path in tqdm(filenames):
+            i_image = fastdup_imread(image_path, None, kwargs=kwargs)
+            if i_image is None:
+                # An unreadable image only loses its own label, not the whole batch.
+                preds.append(MISSING_LABEL)
+                continue
+
+            # The other generators in this module convert the reader's output
+            # from BGR to RGB before handing it to a model; do the same here.
+            i_image = cv2.cvtColor(i_image, cv2.COLOR_BGR2RGB)
+
+            # Transform the image and pass it through the model.
+            inputs = transforms(i_image, return_tensors='pt')
+            output = model(**inputs)
+
+            # Softmax is monotonic, so the argmax of the logits is the predicted class.
+            pred = int(output.logits.argmax(1)[0])
+            preds.append(model.config.id2label[pred])
+        return preds
+    except Exception as e:
+        fastdup_capture_exception("Age label", e)
+        return [MISSING_LABEL] * len(filenames)
@@ -47,7 +47,7 @@
 
 DEFUALT_METRIC_ZERO = 0
 DEFAULT_METRIC_MINUS_ONE = -1
-VERSION__ = "0.912"
+VERSION__ = "0.927"
 
 GITHUB_URL = "https://github.com/visual-layer/fastdup/issues"
 
@@ -72,6 +72,96 @@
 SELECTION_STRATEGY_UNIFORM_METRIC = 2
 
 YOLOV5S_MODEL = "https://github.com/itsnine/yolov5-onnxruntime/raw/master/models/yolov5s.onnx"
+DINOV2S_MODEL = "https://vl-company-website.s3.us-east-2.amazonaws.com/model_artifacts/dinov2/dinov2_vits14.onnx"
+DINOV2S_MODEL_DIM = 384  # presumably the output dimension of DINOV2S_MODEL -- confirm against caller
+DINOV2B_MODEL = "https://vl-company-website.s3.us-east-2.amazonaws.com/model_artifacts/dinov2/dinov2_vitb14.onnx"
+DINOV2B_MODEL_DIM = 768  # presumably the output dimension of DINOV2B_MODEL -- confirm against caller
+
+CAPTION_MODEL1_NAME = 'automatic'  # NOTE(review): these *_NAME values presumably select a label generator -- confirm against caller
+CAPTION_MODEL2_NAME = 'blip'
+CAPTION_MODEL3_NAME = 'blip2'
+VQA_MODEL1_NAME = "indoors_outdoors"
+AGE_LABEL1_NAME = 'age'
+
+# dtypes
+IMG = 'image'
+BBOX = 'bbox'
+
+# run modes
+MODE_DEFAULT = 'default'
+MODE_EMBEDDING = 'embedding'
+MODE_CROP = 'crop'
+MODE_ROTATED_BBOX = 'rotated'
+
+# fastdup files
+SPLITS_CSV = 'splits_found.json'  # NOTE: the value is a .json filename despite the _CSV suffix of the name
+MAPPING_CSV = 'atrain_features.dat.csv'
+BAD_CSV = 'atrain_features.bad.csv'
+FEATURES_DATA = 'atrain_features.dat'
+STATS_CSV = 'atrain_stats.csv'
+CONFIG_JSON = 'config.json'
+CROPS_CSV = 'atrain_crops.csv'
+
+# extra files
+BBOX_INPUT_CSV = 'objects_annot_fastdup_input.csv'
+ANNOT_PKL = 'full_annot.pkl.gz'
+IMG_GRP_ANNOT_PKL = 'img_grouped_annot.pkl.gz'
+
+# annotation expected columns
+ANNOT_FILENAME = 'filename'
+ANNOT_CROP_FILENAME = 'crop_filename'
+ANNOT_IMG_ID = 'img_id'
+ANNOT_IMG_H = 'img_h'
+ANNOT_IMG_W = 'img_w'
+ANNOT_BBOX_X = 'col_x'
+ANNOT_BBOX_Y = 'row_y'
+ANNOT_BBOX_W = 'width'
+ANNOT_BBOX_H = 'height'
+ANNOT_ROT_BBOX_X1 = 'x1'
+ANNOT_ROT_BBOX_Y1 = 'y1'
+ANNOT_ROT_BBOX_X2 = 'x2'
+ANNOT_ROT_BBOX_Y2 = 'y2'
+ANNOT_ROT_BBOX_X3 = 'x3'
+ANNOT_ROT_BBOX_Y3 = 'y3'
+ANNOT_ROT_BBOX_X4 = 'x4'
+ANNOT_ROT_BBOX_Y4 = 'y4'
+ANNOT_SPLIT = 'split'
+ANNOT_ERROR = 'error_code'
+ANNOT_LABEL = 'label'
+
+# extended annotation columns
+ANNOT_VALID = 'is_valid'
+ANNOT_FD_ID = 'index'
+
+# Connected components columns
+CC_INST_ID = '__id'
+CC_UNI_SPLIT = 'uni_split'
+CC_BI_SPLIT = 'bi_split'
+
+# bad files columns (same string values as ANNOT_FILENAME / ANNOT_ERROR / ANNOT_FD_ID)
+BAD_FILENAME = 'filename'
+BAD_ERROR = 'error_code'
+BAD_FD_ID = 'index'
+
+# similarity columns
+SIM_SRC_IMG = 'from'
+SIM_DST_IMG = 'to'
+SIM_SCORE = 'distance'
+
+# outliers columns
+OUT_ID = 'outlier'
+OUT_NEAREST_NEIGHBOR = 'nearest'
+OUT_SCORE = 'distance'  # same column name as SIM_SCORE
+
+# stats columns
+STATS_INST_ID = 'index'
+
+# map file columns
+MAP_INST_ID = 'index'
+MAP_FILENAME = 'filename'
+
+ERROR_MISSING_IMAGE = 'ERROR_MISSING_FILE'  # NOTE: the name says IMAGE while the value says FILE
+ERROR_BAD_BOUNDING_BOX = 'ERROR_BAD_BOUNDING_BOX'
 
 def get_sep():
     return os.sep
@@ -83,7 +83,7 @@ def run(self,
                 - image_filename: {Mandatory}. Relative path to the image wtr to input_dir
                 - split: (Optional). 'train' or 'test'
                 - label: (Optional). Class of the image
-                - bbox_x, bbox_y, bbox_h, bbox_w: (Optional). Bounding box of the object in the image
+                - row_y, col_x, height, width: (Optional). Bounding box of the object in the image
                     if provided, fastdup will run on the bounding box instead of the whole image
                 - x1, y1, x2, y2, x3, y3, x4, y4: (Optional). Bounding box of the object in the image
                     if provided, and bounding_box=='rotated_bbox' fastdup will run on the rotated bounding box.
@@ -148,8 +148,9 @@ def run(self,
                                    verbose=verbose,
                                    license='' if license is None else license,
                                    high_accuracy=high_accuracy)
-        if model_path is not None:
-            assert 'd' in kwargs, 'Please provide d parameter to indicate the model output dimension'
+        if (model_path is not None):
+            if 'dinov2s' not in model_path and 'dinov2b' not in model_path:
+                assert 'd' in kwargs, 'Please provide d parameter to indicate the model output dimension'
             fastdup_func_params['model_path'] = model_path
         fastdup_func_params.update(kwargs)