fix: align pre and post process of doclayout

SWHL · SWHL · commit faa09cfa1ade · 2024-12-23T23:11:42.000+08:00
diff --git a/1.jpg b/1.jpg
diff --git a/demo.py b/demo.py
@@ -5,9 +5,9 @@
 
 from rapid_layout import RapidLayout, VisLayout
 
-layout_engine = RapidLayout(model_type="doclayout_yolo", conf_thres=0.1)
+layout_engine = RapidLayout(model_type="doclayout_yolo", conf_thres=0.2)
 
-img_path = "tests/test_files/PMC3576793_00004.jpg"
+img_path = "1.jpg"
 img = cv2.imread(img_path)
 
 boxes, scores, class_names, elapse = layout_engine(img)
diff --git a/rapid_layout/main.py b/rapid_layout/main.py
@@ -35,7 +35,7 @@
     "yolov8n_layout_report": f"{ROOT_URL}/yolov8n_layout_report.onnx",
     "yolov8n_layout_publaynet": f"{ROOT_URL}/yolov8n_layout_publaynet.onnx",
     "yolov8n_layout_general6": f"{ROOT_URL}/yolov8n_layout_general6.onnx",
-    "doclayout_yolo": f"{ROOT_URL}/doclayout_yolo_docstructbench_imgsz1024.onnx",
+    "doclayout_yolo": f"{ROOT_URL}/doclayout_yolo_docstructbench_imgsz1024_meta.onnx",
 }
 DEFAULT_MODEL_PATH = str(ROOT_DIR / "models" / "layout_cdla.onnx")
 
diff --git a/rapid_layout/utils/augment.py b/rapid_layout/utils/augment.py
@@ -0,0 +1,85 @@
+# -*- encoding: utf-8 -*-
+# @Author: SWHL
+# @Contact: liekkaskono@163.com
+import cv2
+import numpy as np
+
+
+class LetterBox:
+    """Resize image and padding for detection, instance segmentation, pose."""
+
+    def __init__(
+        self,
+        new_shape=(640, 640),
+        auto=False,
+        scaleFill=False,
+        scaleup=True,
+        center=True,
+        stride=32,
+    ):
+        """Initialize LetterBox object with specific parameters."""
+        self.new_shape = new_shape
+        self.auto = auto
+        self.scaleFill = scaleFill
+        self.scaleup = scaleup
+        self.stride = stride
+        self.center = center  # Put the image in the middle or top-left
+
+    def __call__(self, labels=None, image=None):
+        """Return updated labels and image with added border."""
+        if labels is None:
+            labels = {}
+        img = labels.get("img") if image is None else image
+        shape = img.shape[:2]  # current shape [height, width]
+        new_shape = labels.pop("rect_shape", self.new_shape)
+        if isinstance(new_shape, int):
+            new_shape = (new_shape, new_shape)
+
+        # Scale ratio (new / old)
+        r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
+        if not self.scaleup:  # only scale down, do not scale up (for better val mAP)
+            r = min(r, 1.0)
+
+        # Compute padding
+        ratio = r, r  # width, height ratios
+        new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
+        dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding
+        if self.auto:  # minimum rectangle
+            dw, dh = np.mod(dw, self.stride), np.mod(dh, self.stride)  # wh padding
+        elif self.scaleFill:  # stretch
+            dw, dh = 0.0, 0.0
+            new_unpad = (new_shape[1], new_shape[0])
+            ratio = (
+                new_shape[1] / shape[1],
+                new_shape[0] / shape[0],
+            )  # width, height ratios
+
+        if self.center:
+            dw /= 2  # divide padding into 2 sides
+            dh /= 2
+
+        if shape[::-1] != new_unpad:  # resize
+            img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
+        top, bottom = int(round(dh - 0.1)) if self.center else 0, int(round(dh + 0.1))
+        left, right = int(round(dw - 0.1)) if self.center else 0, int(round(dw + 0.1))
+        img = cv2.copyMakeBorder(
+            img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114)
+        )  # add border
+        if labels.get("ratio_pad"):
+            labels["ratio_pad"] = (labels["ratio_pad"], (left, top))  # for evaluation
+
+        if len(labels):
+            labels = self._update_labels(labels, ratio, dw, dh)
+            labels["img"] = img
+            labels["resized_shape"] = new_shape
+            return labels
+        else:
+            return img
+
+    def _update_labels(self, labels, ratio, padw, padh):
+        """Update labels."""
+        labels["instances"].convert_bbox(format="xyxy")
+        labels["instances"].denormalize(*labels["img"].shape[:2][::-1])
+        labels["instances"].scale(*ratio)
+        labels["instances"].add_padding(padw, padh)
+        return labels
diff --git a/rapid_layout/utils/post_prepross.py b/rapid_layout/utils/post_prepross.py
@@ -1,6 +1,7 @@
 # -*- encoding: utf-8 -*-
 # @Author: SWHL
 # @Contact: liekkaskono@163.com
+import re
 from typing import List, Tuple
 
 import numpy as np
@@ -299,7 +300,7 @@ def extract_boxes(self, predictions):
 
 
 class DocLayoutPostProcess:
-    def __init__(self, labels: List[str], conf_thres=0.7, iou_thres=0.5):
+    def __init__(self, labels: List[str], conf_thres=0.2, iou_thres=0.5):
         self.labels = labels
         self.conf_threshold = conf_thres
         self.iou_threshold = iou_thres
@@ -308,40 +309,67 @@ def __init__(self, labels: List[str], conf_thres=0.7, iou_thres=0.5):
 
     def __call__(
         self,
-        output,
+        preds,
         ori_img_shape: Tuple[int, int],
         img_shape: Tuple[int, int] = (1024, 1024),
     ):
-        self.img_height, self.img_width = ori_img_shape
-        self.input_height, self.input_width = img_shape
-
-        output = output[0].squeeze()
-        boxes = output[:, :-2]
-        confidences = output[:, -2]
-        class_ids = output[:, -1].astype(int)
-
-        mask = confidences > self.conf_threshold
-        boxes = boxes[mask, :]
-        confidences = confidences[mask]
-        class_ids = class_ids[mask]
-
-        # Rescale boxes to original image dimensions
-        boxes = rescale_boxes(
-            boxes,
-            self.input_width,
-            self.input_height,
-            self.img_width,
-            self.img_height,
-        )
+        preds = preds[0]
+        mask = preds[..., 4] > self.conf_threshold
+        preds = [p[mask[idx]] for idx, p in enumerate(preds)][0]
+        preds[:, :4] = scale_boxes(list(img_shape), preds[:, :4], list(ori_img_shape))
+
+        boxes = preds[:, :4]
+        confidences = preds[:, 4]
+        class_ids = preds[:, 5].astype(int)
         labels = [self.labels[i] for i in class_ids]
         return boxes, confidences, labels
 
 
-def rescale_boxes(boxes, input_width, input_height, img_width, img_height):
-    # Rescale boxes to original image dimensions
-    input_shape = np.array([input_width, input_height, input_width, input_height])
-    boxes = np.divide(boxes, input_shape, dtype=np.float32)
-    boxes *= np.array([img_width, img_height, img_width, img_height])
+def scale_boxes(
+    img1_shape, boxes, img0_shape, ratio_pad=None, padding=True, xywh=False
+):
+    """
+    Rescales bounding boxes (in the format of xyxy by default) from the shape of the image they were originally
+    specified in (img1_shape) to the shape of a different image (img0_shape).
+
+    Args:
+        img1_shape (tuple): The shape of the image that the bounding boxes are for, in the format of (height, width).
+        boxes (torch.Tensor): the bounding boxes of the objects in the image, in the format of (x1, y1, x2, y2)
+        img0_shape (tuple): the shape of the target image, in the format of (height, width).
+        ratio_pad (tuple): a tuple of (ratio, pad) for scaling the boxes. If not provided, the ratio and pad will be
+            calculated based on the size difference between the two images.
+        padding (bool): If True, assuming the boxes is based on image augmented by yolo style. If False then do regular
+            rescaling.
+        xywh (bool): The box format is xywh or not, default=False.
+
+    Returns:
+        boxes (torch.Tensor): The scaled bounding boxes, in the format of (x1, y1, x2, y2)
+    """
+    if ratio_pad is None:  # calculate from img0_shape
+        gain = min(
+            img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1]
+        )  # gain  = old / new
+        pad = (
+            round((img1_shape[1] - img0_shape[1] * gain) / 2 - 0.1),
+            round((img1_shape[0] - img0_shape[0] * gain) / 2 - 0.1),
+        )  # wh padding
+    else:
+        gain = ratio_pad[0][0]
+        pad = ratio_pad[1]
+
+    if padding:
+        boxes[..., 0] -= pad[0]  # x padding
+        boxes[..., 1] -= pad[1]  # y padding
+        if not xywh:
+            boxes[..., 2] -= pad[0]  # x padding
+            boxes[..., 3] -= pad[1]  # y padding
+    boxes[..., :4] /= gain
+    return clip_boxes(boxes, img0_shape)
+
+
+def clip_boxes(boxes, shape):
+    boxes[..., [0, 2]] = boxes[..., [0, 2]].clip(0, shape[1])  # x1, x2
+    boxes[..., [1, 3]] = boxes[..., [1, 3]].clip(0, shape[0])  # y1, y2
     return boxes
 
 
diff --git a/rapid_layout/utils/pre_procss.py b/rapid_layout/utils/pre_procss.py
@@ -7,6 +7,8 @@
 import cv2
 import numpy as np
 
+from .augment import LetterBox
+
 InputType = Union[str, np.ndarray, bytes, Path]
 
 
@@ -57,11 +59,15 @@ class DocLayoutPreProcess:
 
     def __init__(self, img_size: Tuple[int, int]):
         self.img_size = img_size
+        self.letterbox = LetterBox(new_shape=img_size, auto=False, stride=32)
 
     def __call__(self, image: np.ndarray) -> np.ndarray:
-        input_img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-        input_img = cv2.resize(image, self.img_size)
-        input_img = input_img / 255.0
-        input_img = input_img.transpose(2, 0, 1)
-        input_tensor = input_img[np.newaxis, :, :, :].astype(np.float32)
+        print(image.shape)
+        input_img = self.letterbox(image=image)
+        print(input_img.shape)
+        input_img = input_img[None, ...]
+        input_img = input_img[..., ::-1].transpose(0, 3, 1, 2)
+        input_img = np.ascontiguousarray(input_img)
+        input_img = input_img / 255
+        input_tensor = input_img.astype(np.float32)
         return input_tensor

Original file line number	Diff line number	Diff line change
`@@ -35,7 +35,7 @@`
`35`	`35`	`"yolov8n_layout_report": f"{ROOT_URL}/yolov8n_layout_report.onnx",`
`36`	`36`	`"yolov8n_layout_publaynet": f"{ROOT_URL}/yolov8n_layout_publaynet.onnx",`
`37`	`37`	`"yolov8n_layout_general6": f"{ROOT_URL}/yolov8n_layout_general6.onnx",`
`38`		`- "doclayout_yolo": f"{ROOT_URL}/doclayout_yolo_docstructbench_imgsz1024.onnx",`
	`38`	`+ "doclayout_yolo": f"{ROOT_URL}/doclayout_yolo_docstructbench_imgsz1024_meta.onnx",`
`39`	`39`	`}`
`40`	`40`	`DEFAULT_MODEL_PATH = str(ROOT_DIR / "models" / "layout_cdla.onnx")`
`41`	`41`