|
| 1 | +from abc import ABCMeta |
| 2 | +import json |
| 3 | +from typing import Dict, Any, List, Optional, Union, override |
| 4 | + |
| 5 | +from webmainbench.metrics.base import DocElementType, ParagraphTextType |
| 6 | + |
def normalize_math_delimiters(text: str) -> str:
    """Rewrite BBCode-style math tags as dollar-delimited math.

    This is a fallback for formulas that were split by ``<br>`` tags and
    therefore were not recognised as formulas earlier on.

    Two conversions are applied, in this order:
      1. display math: ``[tex]...[/tex]``   -> ``$$...$$``
      2. inline math:  ``[itex]...[/itex]`` -> ``$...$``

    The formula body is kept as written (inner newlines and spaces included);
    only whitespace at the very start and end of the body is trimmed.

    Args:
        text (str): text that may contain tagged formulas.
    Returns:
        str: the text with the math tags replaced.
    """
    import re

    # (pattern, delimiter) pairs; non-greedy bodies so adjacent formulas stay
    # separate, DOTALL so a formula may span several lines.
    conversions = (
        (r'\[tex\](.*?)\[/tex\]', '$$'),
        (r'\[itex\](.*?)\[/itex\]', '$'),
    )
    for tag_pattern, delimiter in conversions:
        # A callable replacement keeps backslashes in the formula literal.
        text = re.sub(
            tag_pattern,
            lambda found, d=delimiter: f'{d}{found.group(1).strip()}{d}',
            text,
            flags=re.DOTALL,
        )
    return text
| 32 | + |
class ABC(metaclass=ABCMeta):
    """Helper class that provides a standard way to create an ABC using
    inheritance.

    Deriving from this class gives the subclass ``ABCMeta`` as its metaclass
    without having to spell out ``metaclass=ABCMeta``.
    """
    # Empty __slots__: this helper declares no instance attributes of its own.
    __slots__ = ()
| 38 | + |
class StructureMapper(ABC):
    """Convert a content_list structure into another representation.

    Target formats are html, txt, md and so on. Subclasses supply the data to
    convert through ``_get_data()``.
    """

    def __init__(self):
        self.__txt_para_splitter = '\n'
        self.__md_para_splitter = '\n\n'
        self.__text_end = '\n'
        self.__list_item_start = '-'  # list-item prefix in markdown
        # Two spaces: markdown prefix for the second and later paragraphs of a
        # multi-paragraph list item.
        self.__list_para_prefix = '  '
        # TODO: '|' should also be escaped when assembling tables.
        self.__md_special_chars = ['#', '`', '$']
        self.__nodes_document_type = [
            DocElementType.MM_NODE_LIST,
            DocElementType.PARAGRAPH,
            DocElementType.LIST,
            DocElementType.SIMPLE_TABLE,
            DocElementType.COMPLEX_TABLE,
            DocElementType.TITLE,
            DocElementType.IMAGE,
            DocElementType.AUDIO,
            DocElementType.VIDEO,
            DocElementType.CODE,
            DocElementType.EQUATION_INTERLINE,
        ]
        self.__inline_types_document_type = [
            ParagraphTextType.EQUATION_INLINE,
            ParagraphTextType.CODE_INLINE,
        ]

    def _get_data(self):
        """Return the content_list to convert.

        ``to_txt`` iterates the result as pages, and each page as node dicts
        carrying a ``'type'`` key.

        Raises:
            NotImplementedError: always; subclasses must override this hook.
        """
        raise NotImplementedError('This method must be implemented by the subclass.')

    def to_html(self):
        """Convert the content_list to html.

        Raises:
            NotImplementedError: always; subclasses must provide this.
        """
        raise NotImplementedError('This method must be implemented by the subclass.')

    def to_txt(self, exclude_nodes=DocElementType.MM_NODE_LIST, exclude_inline_types=None):
        """Convert the content_list to plain text.

        Args:
            exclude_nodes: node types to skip. Normally a collection of type
                names; a single type name given as a string is also accepted.
            exclude_inline_types (list | None): inline types handed to the
                per-node converter. ``None`` means an empty list.
        Returns:
            str: the text of all kept nodes joined by newlines, with math tags
                normalized, stripped, and terminated by one newline.
        """
        # A fresh list per call instead of a shared mutable default.
        if exclude_inline_types is None:
            exclude_inline_types = []
        # `in` on a bare string would do substring matching, so wrap it.
        if isinstance(exclude_nodes, str):
            exclude_nodes = (exclude_nodes,)

        # One entry per DocElementType block that produced text.
        text_blocks: list[str] = []
        for page in self._get_data():
            for content_lst_node in page:
                if content_lst_node['type'] in exclude_nodes:
                    continue
                # NOTE(review): __content_lst_node_2_txt is not defined in the
                # part of this class visible here - confirm it exists, otherwise
                # this call raises AttributeError.
                txt_content = self.__content_lst_node_2_txt(content_lst_node, exclude_inline_types)
                if txt_content:
                    text_blocks.append(txt_content)

        txt = self.__txt_para_splitter.join(text_blocks)
        txt = normalize_math_delimiters(txt)
        # Always end the document with exactly one trailing newline.
        return txt.strip() + self.__text_end
| 79 | + |
class ContentList(StructureMapper):
    """StructureMapper implementation backed by an in-memory content_list."""

    def __init__(self, json_data_lst: list):
        """Wrap ``json_data_lst``; ``None`` is stored as an empty list."""
        super().__init__()
        self.__content_list = [] if json_data_lst is None else json_data_lst

    def length(self) -> int:
        """Return the number of top-level entries held."""
        held = self.__content_list
        return len(held)

    def append(self, content: dict):
        """Add ``content`` at the end of the wrapped list."""
        held = self.__content_list
        held.append(content)

    def __getitem__(self, key):
        """Read access: return the entry stored under ``key``."""
        held = self.__content_list
        return held[key]

    def __setitem__(self, key, value):
        """Write access: store ``value`` under ``key``."""
        held = self.__content_list
        held[key] = value

    def __delitem__(self, key):
        """Remove the entry stored under ``key``."""
        held = self.__content_list
        del held[key]

    @override
    def _get_data(self) -> List[Dict]:
        """Return the wrapped list itself (not a copy)."""
        return self.__content_list
| 107 | + |
class Statics:
    """Count how many elements of each type a content_list contains.

    The counters live in ``self.statics``, a flat dict that maps a type key to
    its count. Pieces nested inside an element use dotted keys. Example::

        {
            "list": 1,
            "list.text": 2,
            "list.equation-inline": 1,
            "paragraph": 2,
            "paragraph.text": 2,
            "equation-interline": 2
        }
    """

    def __init__(self, statics: dict = None):
        # Any falsy value (None, {}) starts from an empty counter table.
        self.statics = statics if statics else {}
        self._validate(self.statics)

    def _validate(self, statics: dict):
        """Check that ``statics`` is a dict.

        Raises:
            ValueError: if ``statics`` is not a dict.
        """
        if not isinstance(statics, dict):
            raise ValueError('statics must be a dict')

    def _increment(self, key):
        """Add one to the counter stored under ``key`` (missing keys start at 0)."""
        self.statics[key] = self.statics.get(key, 0) + 1

    def _count_list_items(self, items, parent_type):
        """Recursively count the leaf items of a (possibly nested) list.

        Args:
            items: a list to descend into, or a leaf dict carrying a ``'t'`` key.
            parent_type: type of the enclosing element, used as the key prefix.
        """
        if isinstance(items, list):
            for item in items:
                self._count_list_items(item, parent_type)
        elif isinstance(items, dict) and 't' in items:
            # Reached a final text/formula piece.
            self._increment(f"{parent_type}.{items['t']}")

    def __additem__(self, key, value):
        self.statics[key] = value

    def __getitem__(self, key):
        return self.statics[key]

    def __getall__(self):
        return self.statics

    def __clear__(self):
        self.statics = {}

    def print(self):
        """Print the counters as indented JSON."""
        print(json.dumps(self.statics, indent=4))

    def merge_statics(self, statics: dict) -> dict:
        """Merge the counters of another content_list into this one.

        Only numeric values are added; other values are ignored. ``None`` or
        an empty dict leaves the counters unchanged.

        Args:
            statics: counters of one content_list.
        Returns:
            dict: the merged counters (the internal dict, not a copy).
        """
        if statics:
            for key, value in statics.items():
                if isinstance(value, (int, float)):
                    self.statics[key] = self.statics.get(key, 0) + value

        return self.statics

    def get_statics(self, contentlist) -> dict:
        """Count the element types in ``contentlist``, discarding earlier counts.

        Args:
            contentlist: a ContentList object or the raw list data.
        Returns:
            dict: count per element type.
        """
        self.__clear__()
        return self._calculate_statics(contentlist)

    def add_statics(self, contentlist) -> dict:
        """Count the element types in ``contentlist`` on top of existing counts.

        Args:
            contentlist: a ContentList object or the raw list data.
        Returns:
            dict: the accumulated counters.
        """
        return self._calculate_statics(contentlist)

    def _calculate_statics(self, contentlist) -> dict:
        """Add the counts for ``contentlist`` to ``self.statics``.

        Args:
            contentlist: a ContentList object (anything with ``_get_data``) or
                the raw list data; ``None`` is treated as empty.
        Returns:
            dict: the updated counters.
        """
        # Accept either a wrapper object or the raw list.
        if hasattr(contentlist, '_get_data'):
            data = contentlist._get_data()
        else:
            data = contentlist

        # ContentList stores None as an empty list; accept None here as well.
        if data is None:
            return self.statics

        for page in data:  # each page is a list of elements
            for element in page:
                # 1. Count the element itself.
                element_type = element['type']
                self._increment(element_type)

                # 2. Count the internal structure of composite elements.
                if element_type == DocElementType.PARAGRAPH:
                    # Text types inside the paragraph.
                    for item in element['content']:
                        self._increment(f"{DocElementType.PARAGRAPH}.{item['t']}")
                elif element_type == DocElementType.LIST:
                    self._count_list_items(element['content']['items'], DocElementType.LIST)
                elif element_type == DocElementType.COMPLEX_TABLE:
                    # Count tables flagged as complex; tolerate missing or
                    # None content.
                    if (element.get('content') or {}).get('is_complex', False):
                        self._increment(f'{DocElementType.COMPLEX_TABLE}.complex')

        return self.statics
| 235 | + |
0 commit comments