forked from opendatalab/mineru-vl-utils
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathstructs.py
More file actions
177 lines (152 loc) · 5.65 KB
/
structs.py
File metadata and controls
177 lines (152 loc) · 5.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
from typing import Literal
from .vlm_client.base_client import ScoredOutput
class BlockType:
    """String constants naming every kind of layout block."""

    TEXT = "text"                      # plain text
    TITLE = "title"                    # paragraph title / heading
    TABLE = "table"                    # table
    EQUATION = "equation"              # equation (standalone / display equation)
    CODE = "code"                      # code
    ALGORITHM = "algorithm"            # algorithm / pseudocode
    ASIDE_TEXT = "aside_text"          # aside text (binding-margin text, etc.)
    REF_TEXT = "ref_text"              # bibliography reference (a single entry)
    PHONETIC = "phonetic"              # phonetic annotation
    LIST_ITEM = "list_item"            # list item (unordered / ordered list)

    # captions
    TABLE_CAPTION = "table_caption"    # table caption
    IMAGE_CAPTION = "image_caption"    # image caption
    CODE_CAPTION = "code_caption"      # code caption
    TABLE_FOOTNOTE = "table_footnote"  # table footnote
    IMAGE_FOOTNOTE = "image_footnote"  # image footnote

    # paratexts
    HEADER = "header"                  # page header
    FOOTER = "footer"                  # page footer
    PAGE_NUMBER = "page_number"        # page number
    PAGE_FOOTNOTE = "page_footnote"    # page footnote

    # images
    IMAGE = "image"                    # image
    CHART = "chart"                    # chart

    # containers
    LIST = "list"                      # list block (unordered / ordered list)
    IMAGE_BLOCK = "image_block"        # image block (multiple images)
    EQUATION_BLOCK = "equation_block"  # equation block (multi-line equations)

    # unknown
    UNKNOWN = "unknown"                # unknown block
# Set of every accepted block-type string; ContentBlock checks `type` against it.
BLOCK_TYPES = {
    BlockType.TEXT, BlockType.TITLE, BlockType.TABLE, BlockType.EQUATION,
    BlockType.CODE, BlockType.ALGORITHM, BlockType.ASIDE_TEXT,
    BlockType.REF_TEXT, BlockType.PHONETIC, BlockType.LIST_ITEM,
    # captions
    BlockType.TABLE_CAPTION, BlockType.IMAGE_CAPTION, BlockType.CODE_CAPTION,
    BlockType.TABLE_FOOTNOTE, BlockType.IMAGE_FOOTNOTE,
    # paratexts
    BlockType.HEADER, BlockType.FOOTER,
    BlockType.PAGE_NUMBER, BlockType.PAGE_FOOTNOTE,
    # images
    BlockType.IMAGE, BlockType.CHART,
    # containers
    BlockType.LIST, BlockType.IMAGE_BLOCK, BlockType.EQUATION_BLOCK,
    # unknown
    BlockType.UNKNOWN,
}
# Accepted values for a block's rotation angle; None is accepted alongside the four angles.
ANGLE_OPTIONS = {None, 0, 90, 180, 270}
class ExtractResult(list["ContentBlock"]):
    """
    A list of ContentBlock that can additionally carry a layout-step score.

    This is the type returned by two_step_extract() and related methods.
    Because it subclasses list, code written against list[ContentBlock]
    keeps working unchanged.

    When the extraction method is called with scored=True:
        - layout_scored holds the ScoredOutput of the layout detection step
          (a score for the whole page);
        - blocks[i].scored holds the ScoredOutput of the extraction step for
          each individual content block.
    """

    # Always reset to None by __init__; assigned afterwards by whoever has a score.
    layout_scored: ScoredOutput | None

    def __init__(self, blocks=()):
        """Fill the list from the iterable `blocks` and start with no layout score."""
        super().__init__(blocks)
        self.layout_scored = None
class ContentBlock(dict):
    """
    A single layout block, stored as a dict with validated attribute access.

    The instance is a plain dict holding the keys "type", "bbox", "angle",
    "content" and "scored"; the properties below read and write those keys.
    "type", "bbox", "angle" and "content" are validated on construction and on
    every property assignment, while "scored" is stored as given. Writing a
    key directly (block["bbox"] = ...) bypasses validation.

    Validation failures raise AssertionError through an explicit `raise`
    instead of an `assert` statement, so the checks remain active when Python
    runs with -O (which strips assert statements).
    """

    def __init__(
        self,
        type: str,
        bbox: list[float],
        angle: Literal[None, 0, 90, 180, 270] = None,
        content: str | None = None,
    ):
        """
        Initialize a layout block.

        Args:
            type (str): Type of the block (e.g., 'text', 'image', 'table').
                Must be a member of BLOCK_TYPES.
            bbox (list[float]): Bounding box coordinates [xmin, ymin, xmax, ymax].
                Must be a list of four numbers, each in [0, 1], with
                xmin < xmax and ymin < ymax.
            angle (int or None): Rotation angle of the block. Must be one of
                {None, 0, 90, 180, 270}.
            content (str or None): The content of the block (if exists).

        Raises:
            AssertionError: If any argument fails the checks above.
        """
        super().__init__()
        # Validate every argument before storing anything, in the same order
        # as the original checks, so the first reported error is unchanged.
        self._check_type(type)
        self._check_bbox(bbox)
        self._check_angle(angle)
        self._check_content(content)
        self["type"] = type
        self["bbox"] = bbox
        self["angle"] = angle
        self["content"] = content
        # No score is attached at construction time.
        self["scored"] = None

    @staticmethod
    def _check_type(value) -> None:
        """Raise AssertionError unless `value` is a member of BLOCK_TYPES."""
        if value not in BLOCK_TYPES:
            raise AssertionError(f"Unknown type: {value}")

    @staticmethod
    def _check_bbox(value) -> None:
        """Raise AssertionError unless `value` is a valid [x1, y1, x2, y2] box."""
        # The shape check comes first so the later index accesses are safe.
        if not (isinstance(value, list) and len(value) == 4):
            raise AssertionError("Bounding box must be a list of four coordinates")
        if not all(isinstance(coord, (int, float)) for coord in value):
            raise AssertionError("Bounding box coordinates must be numbers")
        if not all(0 <= coord <= 1 for coord in value):
            raise AssertionError("Bounding box coordinates must be in the range [0, 1]")
        if not (value[0] < value[2]):
            raise AssertionError("Bounding box x1 must be less than x2")
        if not (value[1] < value[3]):
            raise AssertionError("Bounding box y1 must be less than y2")

    @staticmethod
    def _check_angle(value) -> None:
        """Raise AssertionError unless `value` is a member of ANGLE_OPTIONS."""
        if value not in ANGLE_OPTIONS:
            raise AssertionError(f"Invalid angle: {value}. Must be one of {ANGLE_OPTIONS}")

    @staticmethod
    def _check_content(value) -> None:
        """Raise AssertionError unless `value` is a str or None."""
        if not (value is None or isinstance(value, str)):
            raise AssertionError("Content must be a string or None")

    @property
    def type(self) -> str:
        """The block type string stored under the "type" key."""
        return self["type"]

    @type.setter
    def type(self, value: str):
        self._check_type(value)
        self["type"] = value

    @property
    def bbox(self) -> list[float]:
        """The bounding box list stored under the "bbox" key."""
        return self["bbox"]

    @bbox.setter
    def bbox(self, value: list[float]):
        self._check_bbox(value)
        self["bbox"] = value

    @property
    def angle(self) -> Literal[None, 0, 90, 180, 270]:
        """The rotation angle stored under the "angle" key."""
        return self["angle"]

    @angle.setter
    def angle(self, value: Literal[None, 0, 90, 180, 270]):
        self._check_angle(value)
        self["angle"] = value

    @property
    def content(self) -> str | None:
        """The content string (or None) stored under the "content" key."""
        return self["content"]

    @content.setter
    def content(self, value: str | None):
        self._check_content(value)
        self["content"] = value

    @property
    def scored(self) -> ScoredOutput | None:
        """The value stored under the "scored" key; None until assigned."""
        return self["scored"]

    @scored.setter
    def scored(self, value: ScoredOutput | None):
        # Stored as given: no validation is applied to the score object.
        self["scored"] = value