Skip to content

Commit 6655863

Browse files
JeansBoussier and ladaapp
authored and committed
detection dataset: Allow more than one class per sample
1 parent c0e6c15 commit 6655863

File tree

11 files changed

+128
-92
lines changed

11 files changed

+128
-92
lines changed

lada/bpjdet/inference.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,8 +112,9 @@ def _post_process_batch(data, imgs, paths, shapes, body_dets, part_dets):
112112

113113
return batch_bboxes, batch_points, batch_scores, batch_imgids, batch_parts_dict, img_indexs
114114

115-
def get_model(device: str, weights_path: str):
115+
def get_model(device: str):
116116
torch_device = torch.device(device)
117+
weights_path = os.path.join(MODEL_WEIGHTS_DIR, '3rd_party', 'ch_head_s_1536_e150_best_mMR.pt')
117118
return attempt_load(weights_path, map_location=torch_device)
118119

119120
def inference(model, image_path, imgz, data, conf_thres=0.45, iou_thres=0.75) -> list[Box]:

lada/lib/__init__.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ class VideoMetadata:
5050

5151
@dataclass
5252
class Detection:
53-
cls: str
53+
cls: int
5454
box: Box
5555
mask: Mask # Binary segmentation mask. Values can be either 0 (background) or mask_val
5656

@@ -67,7 +67,7 @@ class Detections:
6767
Mask value is a non-zero value used in binary mask (Mask) to indicate if pixel belongs to the class
6868
"""
6969
DETECTION_CLASSES = {
70-
"nsfw": dict(class_id=0, mask_value=255),
71-
"sfw_head": dict(class_id=1, mask_value=127),
72-
"sfw_face": dict(class_id=2, mask_value=192),
70+
"nsfw": dict(cls=0, mask_value=255),
71+
"sfw_head": dict(cls=1, mask_value=127),
72+
"sfw_face": dict(cls=2, mask_value=192),
7373
}

lada/lib/box_utils.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
from lada.lib import Box
2+
3+
def box_overlap(box1: Box, box2: Box):
4+
y1min, x1min, y1max, x1max = box1
5+
y2min, x2min, y2max, x2max = box2
6+
return x1min < x2max and x2min < x1max and y1min < y2max and y2min < y1max

lada/lib/mosaic_classifier.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from typing import Optional
55

66
from lada.lib.ultralytics_utils import convert_yolo_boxes
7-
from lada.lib.scene_utils import box_overlap
7+
from lada.lib.box_utils import box_overlap
88
from lada.lib import Image, Box
99
from ultralytics import YOLO
1010

lada/lib/mosaic_detector.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from ultralytics.engine.results import Results
1414
from lada.lib import Box, Mask, Image, VideoMetadata, threading_utils
1515
from lada.lib import image_utils
16+
from lada.lib.box_utils import box_overlap
1617
from lada.lib.mosaic_detection_model import MosaicDetectionModel
1718
from lada.lib.scene_utils import crop_to_box_v3
1819
from lada.lib import video_utils
@@ -67,16 +68,11 @@ def get_masks(self):
6768
def get_boxes(self):
6869
return [box for _, _, box in self.data]
6970

70-
def box_overlaps(self, box1: Box, box2: Box) -> bool:
71-
y_overlaps = (box1[0] <= box2[0] <= box1[2] or box1[0] <= box2[2] <= box1[2]) or (box2[0] <= box1[0] <= box2[2] or box2[0] <= box1[2] <= box2[2])
72-
x_overlaps = (box1[1] <= box2[1] <= box1[3] or box1[1] <= box2[3] <= box1[3]) or (box2[1] <= box1[1] <= box2[3] or box2[1] <= box1[3] <= box2[3])
73-
return y_overlaps and x_overlaps
74-
7571
def belongs(self, box: Box):
7672
if len(self.data) == 0:
7773
return False
7874
last_scene_box = self.data[-1][2]
79-
return self.box_overlaps(last_scene_box, box)
75+
return box_overlap(last_scene_box, box)
8076

8177
def __iter__(self):
8278
return self

lada/lib/nsfw_frame_detector.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,5 +39,5 @@ def __init__(self, model: ultralytics.models.YOLO, device=None, random_extend_ma
3939
self.conf = conf
4040

4141
def detect(self, file_path: str) -> Detections | None:
42-
for results in self.model.predict(source=file_path, stream=False, verbose=False, device=self.device, conf=0.4, iou=0.):
42+
for results in self.model.predict(source=file_path, stream=False, verbose=False, device=self.device, conf=self.conf, iou=0.):
4343
return get_nsfw_frames(results, self.random_extend_masks)

lada/lib/nudenet_nsfw_detector.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from typing import Optional
55

66
from lada.lib.ultralytics_utils import convert_yolo_boxes
7-
from lada.lib.scene_utils import box_overlap
7+
from lada.lib.box_utils import box_overlap
88
from lada.lib import Image, Box
99
from ultralytics import YOLO
1010

lada/lib/scene_utils.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,6 @@
55

66
from lada.lib import Box, Mask, Image
77

8-
def box_overlap(box1: Box, box2: Box):
9-
t1, l1, b1, r1 = box1
10-
t2, l2, b2, r2 = box2
11-
t = max(t1, t2)
12-
l = max(l1, l2)
13-
b = min(b1, b2)
14-
r = min(r1, r2)
15-
return r > l and b > t
168

179
def crop_to_box_v3(box: Box, img: Image, mask_img: Mask, target_size: tuple[int, int], max_box_expansion_factor=1.0, border_size=0):
1810
"""

lada/lib/ultralytics_utils.py

Lines changed: 34 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -77,85 +77,66 @@ def choose_biggest_detection(result: ultralytics.engine.results.Results, trackin
7777
mask = yolo_mask
7878
return box, mask
7979

80-
def convert_segment_masks_to_yolo_segmentation_labels(masks_dir, output_dir, pixel_to_class_mapping):
80+
def _get_unique_pixel_values(mask: Mask) -> list[int]:
81+
# get unique values except background (0)
82+
unique_values = np.unique(mask).tolist()
83+
if 0 in unique_values: unique_values.remove(0) # remove background class
84+
return unique_values
85+
86+
def convert_segment_masks_to_yolo_labels(masks_dir, output_dir_segmentation_labels, output_dir_detection_labels, pixel_to_class_mapping):
8187
"""
8288
pixel_to_class_mapping is a dict providing a mapping from pixel value to class id.
8389
e.g. if you only have a single class with id 0 and binary masks use pixel value 255 then this would be:
8490
pixel_to_class_mapping = {255: 0}
8591
86-
source: ultralytics.data.converter.convert_segment_masks_to_yolo_seg
92+
Based of: ultralytics.data.converter.convert_segment_masks_to_yolo_seg
8793
"""
94+
def get_yolo_box(contour) -> tuple[float]:
95+
x, y, w, h = cv2.boundingRect(contour)
96+
h, w = mask.shape[:2]
97+
center_x = x + w / 2
98+
center_y = y + h / 2
99+
yolo_box = center_x / w, center_y / h, w / w, h / h
100+
return yolo_box
101+
88102
for mask_path in Path(masks_dir).iterdir():
89103
if mask_path.suffix in {".png", ".jpg"}:
90104
mask = cv2.imread(str(mask_path), cv2.IMREAD_GRAYSCALE)
91105
img_height, img_width = mask.shape
92106

93-
unique_values = np.unique(mask) # Get unique pixel values representing different classes
94-
yolo_format_data = []
107+
unique_values = _get_unique_pixel_values(mask)
108+
yolo_segmentation_format_data = []
109+
yolo_detection_format_data = []
95110

96111
for value in unique_values:
97-
if value == 0:
98-
continue # Skip background
99112
class_index = pixel_to_class_mapping.get(value, -1)
100113
if class_index == -1:
101114
print(f"Unknown class for pixel value {value} in file {mask_path}, skipping.")
102115
continue
103116

104117
# Create a binary mask for the current class and find contours
105-
contours, _ = cv2.findContours(
106-
(mask == value).astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
107-
) # Find contours
118+
binary_mask_for_current_class = (mask == value).astype(np.uint8)
119+
contours, _ = cv2.findContours(binary_mask_for_current_class, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
108120

109121
for contour in contours:
110122
if len(contour) >= 3: # YOLO requires at least 3 points for a valid segmentation
111123
contour = contour.squeeze() # Remove single-dimensional entries
112-
yolo_format = [class_index]
124+
yolo_segmentation_format = [class_index]
113125
for point in contour:
114126
# Normalize the coordinates
115-
yolo_format.append(round(point[0] / img_width, 6)) # Rounding to 6 decimal places
116-
yolo_format.append(round(point[1] / img_height, 6))
117-
yolo_format_data.append(yolo_format)
127+
yolo_segmentation_format.append(round(point[0] / img_width, 6)) # Rounding to 6 decimal places
128+
yolo_segmentation_format.append(round(point[1] / img_height, 6))
129+
yolo_segmentation_format_data.append(yolo_segmentation_format)
130+
yolo_detection_format_data.append(get_yolo_box(contour))
131+
118132
# Save Ultralytics YOLO format data to file
119-
output_path = Path(output_dir) / f"{mask_path.stem}.txt"
133+
output_path = Path(output_dir_segmentation_labels) / f"{mask_path.stem}.txt"
120134
with open(output_path, "w", encoding="utf-8") as file:
121-
for item in yolo_format_data:
135+
for item in yolo_segmentation_format_data:
136+
line = " ".join(map(str, item))
137+
file.write(line + "\n")
138+
output_path = Path(output_dir_detection_labels) / f"{mask_path.stem}.txt"
139+
with open(output_path, "w", encoding="utf-8") as file:
140+
for item in yolo_detection_format_data:
122141
line = " ".join(map(str, item))
123142
file.write(line + "\n")
124-
125-
126-
def convert_binary_mask_to_yolo_detection_labels(masks_dir, output_dir, pixel_to_class_mapping):
127-
"""
128-
pixel_to_class_mapping is a dict providing a mapping from pixel value to class id.
129-
e.g. if you only have a single class with id 0 and binary masks use pixel value 255 then this would be:
130-
pixel_to_class_mapping = {255: 0}
131-
132-
"""
133-
134-
def _convert_binary_mask_to_yolo_detection_labels(mask: Mask) -> tuple[float]:
135-
t, l, b, r = mask_utils.get_box(mask)
136-
h, w = mask.shape[:2]
137-
box_width = r - l
138-
box_height = b - t
139-
box_center_x = l + box_width / 2
140-
box_center_y = t + box_height / 2
141-
yolo_box = box_center_x / w, box_center_y / h, box_width / w, box_height / h
142-
return yolo_box
143-
144-
def _get_class_id(mask: Mask) -> int:
145-
unique_values = np.unique(mask).tolist()
146-
if 0 in unique_values: unique_values.remove(0) # remove background class
147-
assert len(unique_values) == 1, f"only single class / binary segmentation mask supported but found these values: {unique_values}"
148-
mask_val = unique_values[0]
149-
150-
class_id = pixel_to_class_mapping.get(mask_val, -1)
151-
assert class_id != -1, f"Unknown class for pixel value {mask_val} in file {mask_path}"
152-
return class_id
153-
154-
for mask_path in Path(masks_dir).iterdir():
155-
if mask_path.suffix in {".png", ".jpg"}:
156-
mask = cv2.imread(str(mask_path), cv2.IMREAD_GRAYSCALE)
157-
class_id = _get_class_id(mask)
158-
yolo_box = _convert_binary_mask_to_yolo_detection_labels(mask)
159-
label_file_path = Path(output_dir).joinpath(Path(mask_path).with_suffix('.txt').name)
160-
with open(label_file_path, 'a') as file:
161-
file.write(f"{class_id} {yolo_box[0]} {yolo_box[1]} {yolo_box[2]} {yolo_box[3]}")

lada/lib/watermark_detector.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from typing import Optional
55

66
from lada.lib.ultralytics_utils import convert_yolo_boxes
7-
from lada.lib.scene_utils import box_overlap
7+
from lada.lib.box_utils import box_overlap
88
from lada.lib import Image, Box
99
from ultralytics import YOLO
1010

0 commit comments

Comments (0)