Add labeled image classification and object detection example tests to CI

dnth · web-flow · commit 0122b1e09b20 · 2023-06-03T13:22:47.000+08:00
diff --git a/.github/workflows/examples-ci.yml b/.github/workflows/examples-ci.yml
@@ -83,4 +83,83 @@ jobs:
         uses: actions/upload-artifact@v3
         with:
           name: fastdup_work_dir_cleaning_image_dataset
+          path: fastdup_work_dir/
+
+  test-labeled-image-classification:  # Runs the labeled image classification example on Imagenette.
+    runs-on: ${{ matrix.os }}
+    env:
+      SENTRY_OPT_OUT: True
+    strategy:
+      matrix:
+        os: [ubuntu-latest]
+        python-version: ['3.9']
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+
+      - name: Set up Python
+        uses: actions/setup-python@v3
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install fastdup matplotlib
+
+      - name: Download dataset
+        run: |
+          wget https://s3.amazonaws.com/fast-ai-imageclas/imagenette2-160.tgz
+          tar -xf imagenette2-160.tgz
+
+      - name: Run example
+        run: |
+          python .github/workflows/tests/labeled_image_classification.py
+
+      - name: Save artifacts
+        uses: actions/upload-artifact@v3
+        with:
+          name: fastdup_work_dir_labeled_image_classification
+          path: fastdup_imagenette/  # must match work_dir in labeled_image_classification.py
+
+  test-labeled-object-detection:  # Runs the labeled object detection example script.
+    runs-on: ${{ matrix.os }}
+    env:
+      SENTRY_OPT_OUT: True
+    strategy:
+      matrix:
+        os: [ubuntu-latest]
+        python-version: ['3.9']
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+
+      - name: Set up Python
+        uses: actions/setup-python@v3
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install fastdup plotly gdown
+
+      - name: Download dataset
+        run: |
+          gdown --fuzzy https://drive.google.com/file/d/1iSXVTlkV1_DhdYpVDqsjlT4NJFQ7OkyK/view
+          unzip -qq coco_minitrain_25k.zip
+          cd coco_minitrain_25k/annotations && gdown --fuzzy https://drive.google.com/file/d/1i12p23cXlqp1QrXjAD_vu467r4q67Mq9/view
+
+      - name: Run example
+        run: |
+          python .github/workflows/tests/labeled_object_detection.py
+
+      - name: Save artifacts
+        uses: actions/upload-artifact@v3
+        with:
+          name: fastdup_work_dir_labeled_object_detection
-          path: fastdup_work_dir/
+          path: fastdup_minicoco/  # must match work_dir in labeled_object_detection.py
diff --git a/.github/workflows/tests/labeled_image_classification.py b/.github/workflows/tests/labeled_image_classification.py
@@ -0,0 +1,52 @@
+# CI example: run fastdup on Imagenette with labels taken from the dataset's CSV.
+# All paths are relative to the current working directory.
+import pandas as pd
+
+import fastdup
+
+DATASET_ROOT = 'imagenette2-160/'
+ANNOTATIONS_CSV = 'imagenette2-160/noisy_imagenette.csv'
+WORK_DIR = 'fastdup_imagenette'
+
+# Label ids as they appear in the CSV -> regular label names.
+CLASS_NAMES = {
+    'n02979186': 'cassette_player',
+    'n03417042': 'garbage_truck',
+    'n01440764': 'tench',
+    'n02102040': 'English_springer',
+    'n03028079': 'church',
+    'n03888257': 'parachute',
+    'n03394916': 'French_horn',
+    'n03000684': 'chain_saw',
+    'n03445777': 'golf_ball',
+    'n03425413': 'gas_pump',
+}
+
+# Keep only the relevant columns and rename them to fastdup's column names.
+annotations = pd.read_csv(ANNOTATIONS_CSV)[['path', 'noisy_labels_0']].rename(
+    columns={'path': 'filename', 'noisy_labels_0': 'label'}
+)
+
+# Prefix every path with the dataset root.
+annotations['filename'] = [DATASET_ROOT + rel_path for rel_path in annotations['filename']]
+
+# The split is the second '/'-separated component of the prefixed filename.
+annotations['split'] = [full_path.split('/')[1] for full_path in annotations['filename']]
+
+# Map label ids to regular labels.
+annotations['label'] = annotations['label'].map(CLASS_NAMES)
+
+print(f'fastdup version: {fastdup.__version__}')
+
+fd = fastdup.create(work_dir=WORK_DIR, input_dir=DATASET_ROOT)
+fd.run(annotations=annotations, ccthreshold=0.9, threshold=0.8)
+
+# Galleries, generated in a fixed order.
+fd.vis.duplicates_gallery(num_images=5)
+fd.vis.component_gallery(num_images=5)
+fd.vis.component_gallery(slice='chain_saw')
+fd.vis.outliers_gallery(num_images=5)
+fd.vis.similarity_gallery()
+
+for stat in ('dark', 'bright', 'blur'):
+    fd.vis.stats_gallery(metric=stat, num_images=5)
diff --git a/.github/workflows/tests/labeled_object_detection.py b/.github/workflows/tests/labeled_object_detection.py
@@ -0,0 +1,23 @@
+# CI example: run fastdup on the COCO minitrain subset with annotations read from a CSV.
+# All paths are relative to the current working directory.
+import fastdup
+import pandas as pd
+
+ANNOTATIONS_CSV = 'coco_minitrain_25k/annotations/coco_minitrain2017.csv'
+IMAGE_PREFIX = 'coco_minitrain_25k/images/train2017/'
+CSV_COLUMNS = ['filename', 'col_x', 'row_y', 'width', 'height', 'label', 'ext']
+
+print(f'fastdup version: {fastdup.__version__}')
+
+# The CSV is read without a header row; column names are supplied explicitly.
+boxes = pd.read_csv(ANNOTATIONS_CSV, header=None, names=CSV_COLUMNS)
+boxes['split'] = 'train'  # Only train files were loaded
+boxes['filename'] = [IMAGE_PREFIX + name for name in boxes['filename']]
+boxes = boxes.drop_duplicates()
+
+fd = fastdup.create(work_dir='fastdup_minicoco', input_dir='.')
+fd.run(annotations=boxes, overwrite=True, num_images=10000)
+
+fd.vis.component_gallery(metric='size', max_width=900)
+fd.vis.outliers_gallery()
+fd.vis.component_gallery(num_images=25, slice='diff')