33from typing import Union , Iterable
44import pandas as pd
55from fastdup .fastdup_controller import FastdupController
6- from fastdup .fastdup_galleries import FastdupVisualizer
6+ from fastdup .fastdup_visualizer import FastdupVisualizer
77
88
99class Fastdup (FastdupController ):
@@ -12,25 +12,21 @@ class Fastdup(FastdupController):
1212 Usage example
1313 =============
1414
15+ from fastdup.engine import Fastdup
1516
16- # >>> from fastdup.engine.Fastdup
17- # >>> import fastdup.fastdup_constants as FD
18- # >>>
19- # >>> annotation_csv = '/path/to/annotation.csv'
20- # >>> data_dir = '/path/to/images/'
21- # >>> output_dir = '/path/to/fastdup_analysis'
22- # >>>
23- # >>> fdp = FastDupProxy(work_dir=output_dir)
24- # >>> fdp.run(input_dir=data_dir,
25- # >>> df_annot=pd.read_csv(annotation_csv))
26- # >>>
27- # >>>
28- # >>> df_sim = fdp.similarity(data=True)
29- # >>> im1_id, im2_id, sim = df_sim.iloc[0]
30- # >>> annot_im1, annot_im2 = fdp[im1_id], fdp[im2_id]
31- # >>>
32- # >>> df_cc, cc_info = fd.connected_components(data=True)
33- # >>>
17+ annotation_csv = '/path/to/annotation.csv'
18+ data_dir = '/path/to/images/'
19+ output_dir = '/path/to/fastdup_analysis'
20+
21+ fd = Fastdup(work_dir=output_dir)
22+ fd.run(input_dir=data_dir, annotations=pd.read_csv(annotation_csv))
23+
24+
25+ df_sim = fd.similarity()
26+ im1_id, im2_id, sim = df_sim.iloc[0]
27+ annot_im1, annot_im2 = fd[im1_id], fd[im2_id]
28+
29+ df_cc, cc_info = fd.connected_components()
3430 """
3531
3632 def __init__ (self , work_dir : Union [str , Path ], input_dir : Union [str , Path ] = None ):
@@ -42,7 +38,7 @@ def run(self,
4238 annotations : pd .DataFrame = None ,
4339 embeddings = None ,
4440 subset : list = None ,
45- data_type : str = 'infer ' ,
41+ data_type : str = 'image ' ,
4642 overwrite : bool = False ,
4743 model_path = None ,
4844 distance = 'cosine' ,
@@ -83,8 +79,20 @@ def run(self,
8379 run fastdup on local disk. Since copying images from s3 in a loop is very slow, alternatively you can
8480 use the flag sync_s3_to_local=True to copy ahead all images on the remote s3 bucket to disk
8581 :param annotations: Optional dataframe with annotations.
82+ annotation dataframe should have the following columns:
83+ - image_filename: (Mandatory). Relative path to the image with respect to input_dir
84+ - split: (Optional). 'train' or 'test'
85+ - label: (Optional). Class of the image
86+ - bbox_x, bbox_y, bbox_h, bbox_w: (Optional). Bounding box of the object in the image
87+ if provided, fastdup will run on the bounding box instead of the whole image
88+ - x1, y1, x2, y2, x3, y3, x4, y4: (Optional). Bounding box of the object in the image
89+ if provided, and bounding_box=='rotated_bbox' fastdup will run on the rotated bounding box.
90+ - additional columns can be added and will be added to the output dataframe
91+
92+ :param embeddings: list of embeddings, if given fastdup will be activated on the given embedding instead of the
93+ images. The embeddings should be in the same order as the images in the annotations dataframe.
8694 :param subset: List of images to run on. If None, run on all the images/bboxes.
87- :param data_type: Type of data to run on. Supported types: 'image', 'bbox'. Default is 'infer '.
95+ :param data_type: Type of data to run on. Supported types: 'image', 'bbox'. Default is 'image '.
8896 :param model_path: path to model for feature extraction. supported formats: onnx, ort.
8997 Make sure to update d parameter accordingly.
9098 :param distance: - distance metric for the Nearest Neighbors algorithm.
@@ -99,13 +107,10 @@ def run(self,
99107 :param outlier_percentile: Percentile of the outlier score to use as threshold. Default is 0.5 (50%).
100108 :param threshold: Threshold to use for the graph generation. Default is 0.9.
101109 :param cc_threshold: Threshold to use for the graph connected component. Default is 0.96.
102- :param bounding_box: Optional bounding box to crop images, given as TODO: internal bounding box or global bounding box?
103- bounding_box='row_y=xx,col_x=xx,height=xx,width=xx'. This defines a global bounding box to be used
104- for all images. Alternatively, it is possible to set bounding_box='face' to crop the face from the image
105- (in case a face is present). For the face crop the margin around the face is defined by
106- augmentation_horiz=0.2, augmentation_vert=0.2 where 0.2 mean 20% additional margin around the
107- face relative to the width and height respectively. It is possible to change the margin,
108- lower allower value is 0 (no margin) and upper allowed value is 1. Default is 0.2.
110+ :param bounding_box: yolov5s|face|rotated_bbox
111+ - yolov5s: Use yolov5s to detect objects in the image and run fastdup on each object.
112+ - face: Use face detection to detect faces in the image and run fastdup on each face.
113+ - rotated_bbox: Use the rotated bounding box given in the annotations dataframe to run fastdup on the object.
109114 :param num_threads: Number of threads. By default, autoconfigured by the number of cores.
110115 :param license: Optional license key. If not provided, only free features are available.
111116 :param overwrite: Optional flag to overwrite existing fastdup results.
0 commit comments