Skip to content

Commit faa09cf

Browse files
committed
fix: align pre- and post-processing of doclayout
1 parent 8e96773 commit faa09cf

File tree

6 files changed

+155
-36
lines changed

6 files changed

+155
-36
lines changed

1.jpg

51.2 KB
Loading

demo.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,9 @@
55

66
from rapid_layout import RapidLayout, VisLayout
77

8-
layout_engine = RapidLayout(model_type="doclayout_yolo", conf_thres=0.1)
8+
layout_engine = RapidLayout(model_type="doclayout_yolo", conf_thres=0.2)
99

10-
img_path = "tests/test_files/PMC3576793_00004.jpg"
10+
img_path = "1.jpg"
1111
img = cv2.imread(img_path)
1212

1313
boxes, scores, class_names, elapse = layout_engine(img)

rapid_layout/main.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@
3535
"yolov8n_layout_report": f"{ROOT_URL}/yolov8n_layout_report.onnx",
3636
"yolov8n_layout_publaynet": f"{ROOT_URL}/yolov8n_layout_publaynet.onnx",
3737
"yolov8n_layout_general6": f"{ROOT_URL}/yolov8n_layout_general6.onnx",
38-
"doclayout_yolo": f"{ROOT_URL}/doclayout_yolo_docstructbench_imgsz1024.onnx",
38+
"doclayout_yolo": f"{ROOT_URL}/doclayout_yolo_docstructbench_imgsz1024_meta.onnx",
3939
}
4040
DEFAULT_MODEL_PATH = str(ROOT_DIR / "models" / "layout_cdla.onnx")
4141

rapid_layout/utils/augment.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
# -*- encoding: utf-8 -*-
2+
# @Author: SWHL
3+
# @Contact: [email protected]
4+
import cv2
5+
import numpy as np
6+
7+
8+
class LetterBox:
    """Resize an image to fit a target shape and pad the remainder with a border.

    The aspect ratio is kept unless ``scaleFill`` is set, in which case the image
    is stretched to the target shape and no padding is added.
    """

    def __init__(
        self,
        new_shape=(640, 640),
        auto=False,
        scaleFill=False,
        scaleup=True,
        center=True,
        stride=32,
        pad_value=(114, 114, 114),
    ):
        """Store the letterbox configuration.

        Args:
            new_shape: Target ``(height, width)``, or a single int for a square.
            auto: If True, pad only up to the next multiple of ``stride``
                instead of up to ``new_shape``.
            scaleFill: If True (and ``auto`` is False), stretch to ``new_shape``
                without padding.
            scaleup: If False, the image is never enlarged, only shrunk.
            center: If True, split the padding between both sides; otherwise
                the image stays in the top-left corner.
            stride: Modulus used for the padding when ``auto`` is True.
            pad_value: Border colour. A single number is expanded to three
                identical channel values.
        """
        self.new_shape = new_shape
        self.auto = auto
        self.scaleFill = scaleFill
        self.scaleup = scaleup
        self.stride = stride
        self.center = center  # Put the image in the middle or top-left
        if isinstance(pad_value, (int, float)):
            pad_value = (pad_value,) * 3
        self.pad_value = tuple(pad_value)

    def __call__(self, labels=None, image=None):
        """Letterbox an image and, when labels are supplied, update them to match.

        Args:
            labels: Optional dict. ``"img"`` is used when ``image`` is None,
                ``"rect_shape"`` (removed from the dict) overrides the target
                shape, and a truthy ``"ratio_pad"`` is extended with the
                applied ``(left, top)`` padding.
            image: Image array to process; takes precedence over ``labels["img"]``.

        Returns:
            The padded image when ``labels`` is empty, otherwise the ``labels``
            dict with ``"img"`` and ``"resized_shape"`` set.

        Raises:
            ValueError: If no image is available or the image has a zero-sized
                height or width.
        """
        if labels is None:
            labels = {}
        img = labels.get("img") if image is None else image
        if img is None:
            raise ValueError(
                "LetterBox needs an image: pass `image` or provide labels['img']."
            )
        if img.ndim < 2 or 0 in img.shape[:2]:
            raise ValueError(
                f"LetterBox got an image with invalid shape {tuple(img.shape)}."
            )

        shape = img.shape[:2]  # current shape [height, width]
        new_shape = labels.pop("rect_shape", self.new_shape)
        if isinstance(new_shape, int):
            new_shape = (new_shape, new_shape)

        # Scale ratio (new / old)
        r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
        if not self.scaleup:  # only scale down, do not scale up
            r = min(r, 1.0)

        # Compute padding
        ratio = r, r  # width, height ratios
        new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
        dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding
        if self.auto:  # minimum rectangle
            dw, dh = np.mod(dw, self.stride), np.mod(dh, self.stride)
        elif self.scaleFill:  # stretch
            dw, dh = 0.0, 0.0
            new_unpad = (new_shape[1], new_shape[0])
            ratio = (
                new_shape[1] / shape[1],
                new_shape[0] / shape[0],
            )  # width, height ratios

        if self.center:
            dw /= 2  # divide padding into 2 sides
            dh /= 2

        if shape[::-1] != new_unpad:  # resize
            img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)

        # The -0.1 / +0.1 offsets make an odd padding split deterministically:
        # the smaller half goes to top/left, the larger half to bottom/right.
        top, bottom = int(round(dh - 0.1)) if self.center else 0, int(round(dh + 0.1))
        left, right = int(round(dw - 0.1)) if self.center else 0, int(round(dw + 0.1))
        img = cv2.copyMakeBorder(
            img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=self.pad_value
        )  # add border
        if labels.get("ratio_pad"):
            labels["ratio_pad"] = (labels["ratio_pad"], (left, top))  # for evaluation

        if labels:
            # _update_labels must run before "img" is overwritten: it reads the
            # original image size from labels["img"].
            labels = self._update_labels(labels, ratio, dw, dh)
            labels["img"] = img
            labels["resized_shape"] = new_shape
            return labels
        return img

    def _update_labels(self, labels, ratio, padw, padh):
        """Apply the same scale and padding to ``labels["instances"]``.

        ``labels["instances"]`` is expected to provide ``convert_bbox``,
        ``denormalize``, ``scale`` and ``add_padding``; that type is not
        defined in this module.
        """
        labels["instances"].convert_bbox(format="xyxy")
        labels["instances"].denormalize(*labels["img"].shape[:2][::-1])
        labels["instances"].scale(*ratio)
        labels["instances"].add_padding(padw, padh)
        return labels

rapid_layout/utils/post_prepross.py

Lines changed: 56 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# -*- encoding: utf-8 -*-
22
# @Author: SWHL
33
# @Contact: [email protected]
4+
import re
45
from typing import List, Tuple
56

67
import numpy as np
@@ -299,7 +300,7 @@ def extract_boxes(self, predictions):
299300

300301

301302
class DocLayoutPostProcess:
302-
def __init__(self, labels: List[str], conf_thres=0.7, iou_thres=0.5):
303+
def __init__(self, labels: List[str], conf_thres=0.2, iou_thres=0.5):
303304
self.labels = labels
304305
self.conf_threshold = conf_thres
305306
self.iou_threshold = iou_thres
@@ -308,40 +309,67 @@ def __init__(self, labels: List[str], conf_thres=0.7, iou_thres=0.5):
308309

309310
def __call__(
    self,
    preds,
    ori_img_shape: Tuple[int, int],
    img_shape: Tuple[int, int] = (1024, 1024),
):
    """Convert raw model output into boxes, scores and label names for one image.

    Args:
        preds: Sequence of model outputs. ``preds[0]`` is assumed to be a
            ``(batch, candidates, >=6)`` array with columns
            ``x1, y1, x2, y2, score, class_id`` (TODO confirm against the
            exported model).
        ori_img_shape: ``(height, width)`` of the original image.
        img_shape: ``(height, width)`` of the letterboxed network input.

    Returns:
        ``(boxes, confidences, labels)`` for the first image of the batch,
        with boxes mapped back to original-image coordinates.
    """
    batch = preds[0]
    keep = batch[..., 4] > self.conf_threshold

    # Only the first image is post-processed. With NumPy arrays, boolean
    # indexing yields a copy, so the in-place rescaling below leaves the raw
    # model output untouched.
    dets = batch[0][keep[0]]

    boxes = scale_boxes(list(img_shape), dets[:, :4], list(ori_img_shape))
    confidences = dets[:, 4]
    class_ids = dets[:, 5].astype(int)
    labels = [self.labels[i] for i in class_ids]
    return boxes, confidences, labels
338326

339327

340-
def rescale_boxes(boxes, input_width, input_height, img_width, img_height):
341-
# Rescale boxes to original image dimensions
342-
input_shape = np.array([input_width, input_height, input_width, input_height])
343-
boxes = np.divide(boxes, input_shape, dtype=np.float32)
344-
boxes *= np.array([img_width, img_height, img_width, img_height])
328+
def scale_boxes(
    img1_shape, boxes, img0_shape, ratio_pad=None, padding=True, xywh=False
):
    """
    Rescale bounding boxes from the image they were predicted on (img1_shape)
    to a different image size (img0_shape), undoing letterbox padding.

    Args:
        img1_shape (tuple): Shape of the image the boxes refer to, as (height, width).
        boxes (np.ndarray): Boxes whose first four columns are (x1, y1, x2, y2),
            or (x, y, w, h) when ``xywh`` is True. Floating-point arrays are
            modified in place; integer arrays are converted to float32 first
            because in-place division cannot store fractional results in them.
        img0_shape (tuple): Shape of the target image, as (height, width).
        ratio_pad (tuple): Optional ((ratio, ...), (pad_x, pad_y)). When omitted,
            gain and padding are derived from the two shapes.
        padding (bool): If True, subtract the letterbox padding before scaling.
            If False, only divide by the gain.
        xywh (bool): If True, only the first two columns are shifted by the padding.

    Returns:
        boxes (np.ndarray): The rescaled boxes after being passed through clip_boxes.
    """
    if isinstance(boxes, np.ndarray) and not np.issubdtype(boxes.dtype, np.floating):
        boxes = boxes.astype(np.float32)

    if ratio_pad is None:  # calculate from img0_shape
        gain = min(
            img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1]
        )  # gain = old / new
        pad = (
            round((img1_shape[1] - img0_shape[1] * gain) / 2 - 0.1),
            round((img1_shape[0] - img0_shape[0] * gain) / 2 - 0.1),
        )  # wh padding
    else:
        gain = ratio_pad[0][0]
        pad = ratio_pad[1]

    if padding:
        boxes[..., 0] -= pad[0]  # x padding
        boxes[..., 1] -= pad[1]  # y padding
        if not xywh:
            boxes[..., 2] -= pad[0]  # x padding
            boxes[..., 3] -= pad[1]  # y padding
    boxes[..., :4] /= gain
    return clip_boxes(boxes, img0_shape)
368+
369+
370+
def clip_boxes(boxes, shape):
    """Clamp box coordinates in place to the bounds of an image.

    Columns 0 and 2 are limited to ``[0, shape[1]]`` and columns 1 and 3 to
    ``[0, shape[0]]``; the same ``boxes`` object is returned.
    """
    max_y, max_x = shape[0], shape[1]
    x_cols, y_cols = [0, 2], [1, 3]
    boxes[..., x_cols] = boxes[..., x_cols].clip(0, max_x)  # x1, x2
    boxes[..., y_cols] = boxes[..., y_cols].clip(0, max_y)  # y1, y2
    return boxes
346374

347375

rapid_layout/utils/pre_procss.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
import cv2
88
import numpy as np
99

10+
from .augment import LetterBox
11+
1012
InputType = Union[str, np.ndarray, bytes, Path]
1113

1214

@@ -57,11 +59,15 @@ class DocLayoutPreProcess:
5759

5860
def __init__(self, img_size: Tuple[int, int]):
    """Keep the network input size and build the letterbox transform for it."""
    self.img_size = img_size
    # auto=False pads up to the full target shape, not just to a stride multiple.
    letterbox_options = {"new_shape": img_size, "auto": False, "stride": 32}
    self.letterbox = LetterBox(**letterbox_options)
6063

6164
def __call__(self, image: np.ndarray) -> np.ndarray:
    """Turn an image into the batched, normalized float32 tensor for the model.

    Args:
        image: Image array in height x width x channels layout. The channel
            order is reversed below (presumably BGR from OpenCV to RGB;
            confirm against the caller).

    Returns:
        A contiguous float32 array of shape (1, channels, height, width) with
        values divided by 255.
    """
    input_img = self.letterbox(image=image)
    input_img = input_img[None, ...]  # add the batch axis
    # Reverse the channel axis, then reorder NHWC -> NCHW.
    input_img = input_img[..., ::-1].transpose(0, 3, 1, 2)
    input_img = np.ascontiguousarray(input_img)
    input_img = input_img / 255
    input_tensor = input_img.astype(np.float32)
    return input_tensor

0 commit comments

Comments
 (0)