Skip to content

Commit 3595ebf

Browse files
committed
updating latest code for 0.211
1 parent 614fbbe commit 3595ebf

14 files changed

+1069
-441
lines changed

fastdup/__init__.py

Lines changed: 112 additions & 37 deletions
Large diffs are not rendered by default.

fastdup/cvat.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import cv2
1818
import shutil
1919
import zipfile
20+
from fastdup.image import get_shape
2021

2122
MANIFEST_FILE = 'data/manifest.jsonl'
2223
INDEX_FILE = 'data/index.json'
@@ -83,7 +84,7 @@ def create_annotations_file(files, labels, save_path):
8384
assert os.path.exists(f), "Faied to find path " + str(f)
8485
img = cv2.imread(f)
8586
assert img is not None, "Failed to read image" + str(f)
86-
h, w, c = img.shape
87+
h, w, c = get_shape(img)
8788
shape.append(
8889
{
8990
"type":"rectangle",
@@ -245,7 +246,7 @@ def create_cvat_manifest(files, save_path):
245246
for f in files:
246247
filename, ext = os.path.splitext(os.path.basename(f))
247248
img = cv2.imread(f)
248-
h, w, c = img.shape
249+
h, w, c = get_shape(img)
249250
cstr = "{"
250251
cstr += "\"name\":\"{}\",\"extension\":\".{}\",\"width\":{},\"height\":{}".format(filename, ext[1:], w, h)
251252
cstr += ",\"meta\":{\"related_images\":[]}}\n"

fastdup/definitions.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,13 @@
1414
FILENAME_CONNECTED_COMPONENTS = "connected_components.csv"
1515
FILENAME_LABELS = "labels.csv"
1616
FILENAME_KMEANS_CENTROIDS = "kmeans_centroids.csv"
17+
FILENAME_CROP_LIST = "crops.csv"
1718
FILENAME_KMEANS_ASSIGNMENTS = "kmeans_assignments.csv"
1819
FILENAME_ERROR_MSG = "error.msg"
1920
FILENAME_DUPLICATES_HTML = "duplicates.html"
2021
FILENAME_OUTLIERS_HTML = "outliers.html"
2122
FILENAME_COMPONENTS_HTML = "components.html"
23+
FOLDER_FULL_IMAGE_RUN = "full_image_run"
2224

2325
IMAGELIST_HEADER="index,filename"
2426
LABEL_HEADER="index.label"
@@ -45,7 +47,7 @@
4547

4648
DEFUALT_METRIC_ZERO = 0
4749
DEFAULT_METRIC_MINUS_ONE = -1
48-
VERSION__ = "0.211"
50+
VERSION__ = "0.912"
4951

5052
GITHUB_URL = "https://github.com/visual-layer/fastdup/issues"
5153

fastdup/engine.py

Lines changed: 33 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from typing import Union, Iterable
44
import pandas as pd
55
from fastdup.fastdup_controller import FastdupController
6-
from fastdup.fastdup_galleries import FastdupVisualizer
6+
from fastdup.fastdup_visualizer import FastdupVisualizer
77

88

99
class Fastdup(FastdupController):
@@ -12,25 +12,21 @@ class Fastdup(FastdupController):
1212
Usage example
1313
=============
1414
15+
from fastdup.engine import Fastdup
1516
16-
# >>> from fastdup.engine.Fastdup
17-
# >>> import fastdup.fastdup_constants as FD
18-
# >>>
19-
# >>> annotation_csv = '/path/to/annotation.csv'
20-
# >>> data_dir = '/path/to/images/'
21-
# >>> output_dir = '/path/to/fastdup_analysis'
22-
# >>>
23-
# >>> fdp = FastDupProxy(work_dir=output_dir)
24-
# >>> fdp.run(input_dir=data_dir,
25-
# >>> df_annot=pd.read_csv(annotation_csv))
26-
# >>>
27-
# >>>
28-
# >>> df_sim = fdp.similarity(data=True)
29-
# >>> im1_id, im2_id, sim = df_sim.iloc[0]
30-
# >>> annot_im1, annot_im2 = fdp[im1_id], fdp[im2_id]
31-
# >>>
32-
# >>> df_cc, cc_info = fd.connected_components(data=True)
33-
# >>>
17+
annotation_csv = '/path/to/annotation.csv'
18+
data_dir = '/path/to/images/'
19+
output_dir = '/path/to/fastdup_analysis'
20+
21+
fd = Fastdup(work_dir=output_dir)
22+
fd.run(input_dir=data_dir, annotations=pd.read_csv(annotation_csv))
23+
24+
25+
df_sim = fd.similarity()
26+
im1_id, im2_id, sim = df_sim.iloc[0]
27+
annot_im1, annot_im2 = fd[im1_id], fd[im2_id]
28+
29+
df_cc, cc_info = fd.connected_components()
3430
"""
3531

3632
def __init__(self, work_dir: Union[str, Path], input_dir: Union[str, Path] = None):
@@ -42,7 +38,7 @@ def run(self,
4238
annotations: pd.DataFrame = None,
4339
embeddings=None,
4440
subset: list = None,
45-
data_type: str = 'infer',
41+
data_type: str = 'image',
4642
overwrite: bool = False,
4743
model_path=None,
4844
distance='cosine',
@@ -83,8 +79,20 @@ def run(self,
8379
run fastdup on local disk. Since copying images from s3 in a loop is very slow, Alternatively you can
8480
use the flag sync_s3_to_local=True to copy ahead all images on the remote s3 bucket to disk
8581
:param annotations: Optional dataframe with annotations.
82+
annotation dataframe should have the following columns:
83+
- image_filename: (Mandatory). Relative path to the image w.r.t. input_dir
84+
- split: (Optional). 'train' or 'test'
85+
- label: (Optional). Class of the image
86+
- bbox_x, bbox_y, bbox_h, bbox_w: (Optional). Bounding box of the object in the image
87+
if provided, fastdup will run on the bounding box instead of the whole image
88+
- x1, y1, x2, y2, x3, y3, x4, y4: (Optional). Bounding box of the object in the image
89+
if provided, and bounding_box=='rotated_bbox' fastdup will run on the rotated bounding box.
90+
- additional columns can be added and will be added to the output dataframe
91+
92+
:param embeddings: list of embeddings, if given fastdup will be activated on the given embedding instead of the
93+
images. The embeddings should be in the same order as the images in the annotations dataframe.
8694
:param subset: List of images to run on. If None, run on all the images/bboxes.
87-
:param data_type: Type of data to run on. Supported types: 'image', 'bbox'. Default is 'infer'.
95+
:param data_type: Type of data to run on. Supported types: 'image', 'bbox'. Default is 'image'.
8896
:param model_path: path to model for feature extraction. supported formats: onnx, ort.
8997
Make sure to update the d parameter accordingly.
9098
:param distance: - distance metric for the Nearest Neighbors algorithm.
@@ -99,13 +107,10 @@ def run(self,
99107
:param outlier_percentile: Percentile of the outlier score to use as threshold. Default is 0.5 (50%).
100108
:param threshold: Threshold to use for the graph generation. Default is 0.9.
101109
:param cc_threshold: Threshold to use for the graph connected component. Default is 0.96.
102-
:param bounding_box: Optional bounding box to crop images, given as TODO: internal bounding box or global bounding box?
103-
bounding_box='row_y=xx,col_x=xx,height=xx,width=xx'. This defines a global bounding box to be used
104-
for all images. Alternatively, it is possible to set bounding_box='face' to crop the face from the image
105-
(in case a face is present). For the face crop the margin around the face is defined by
106-
augmentation_horiz=0.2, augmentation_vert=0.2 where 0.2 mean 20% additional margin around the
107-
face relative to the width and height respectively. It is possible to change the margin,
108-
lower allower value is 0 (no margin) and upper allowed value is 1. Default is 0.2.
110+
:param bounding_box: yolov5s|face|rotated_bbox
111+
- yolov5s: Use yolov5s to detect objects in the image and run fastdup on each object.
112+
- face: Use face detection to detect faces in the image and run fastdup on each face.
113+
- rotated_bbox: Use the rotated bounding box given in the annotation dataframe to run fastdup on the object.
109114
:param num_threads: Number of threads. By default, autoconfigured by the number of cores.
110115
:param license: Optional license key. If not provided, only free features are available.
111116
:param overwrite: Optional flag to overwrite existing fastdup results.

0 commit comments

Comments
 (0)