Skip to content

Commit f86d4fa

Browse files
Startalkersyc366
andauthored
feature: add unstructured formatter operator for doc/docx (#17)
* feature: add UnstructuredFormatter * feature: add UnstructuredFormatter in db * feature: add unstructured[docx]==0.18.15 * feature: support doc --------- Co-authored-by: Startalker <[email protected]>
1 parent c52702b commit f86d4fa

File tree

7 files changed

+63
-3
lines changed

7 files changed

+63
-3
lines changed

runtime/ops/formatter/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ def _import_operators():
2020
from . import img_formatter
2121
from . import file_exporter
2222
from . import slide_formatter
23+
from . import unstructured_formatter
2324

2425

2526
_import_operators()
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# -*- coding: utf-8 -*-
2+
3+
from datamate.core.base_op import OPERATORS
4+
5+
# Register the operator name together with the dotted path of the module
# that contains its implementation.
OPERATORS.register_module(
    module_name="UnstructuredFormatter",
    module_path="ops.formatter.unstructured_formatter.process",
)
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
name: '非结构化文本抽取'
2+
name_en: 'Unstructured Text Extraction'
3+
description: '抽取非结构化文件的文本,目前支持word文档'
4+
description_en: 'Extracts text from unstructured files, currently supporting Word documents.'
5+
language: 'python'
6+
vendor: 'huawei'
7+
raw_id: 'UnstructuredFormatter'
8+
version: '1.0.0'
9+
types:
10+
- 'collect'
11+
modal: 'text'
12+
effect:
13+
before: ''
14+
after: ''
15+
inputs: 'text'
16+
outputs: 'text'
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
2+
#!/usr/bin/python
3+
# -*- coding: utf-8 -*-
4+
5+
"""
6+
Description: 非结构化文本抽取
7+
Create: 2025/10/22 15:15
8+
"""
9+
import time
10+
from typing import Dict, Any
11+
12+
from loguru import logger
13+
from unstructured.partition.auto import partition
14+
15+
from datamate.core.base_op import Mapper
16+
17+
18+
class UnstructuredFormatter(Mapper):
    """Extract the text of an unstructured document into the sample.

    The file referenced by the sample is parsed with ``partition`` from the
    ``unstructured`` package; the string form of every returned element is
    joined with blank lines and stored under ``self.text_key``.
    """

    def __init__(self, *args, **kwargs):
        """Forward all arguments unchanged to the ``Mapper`` initializer."""
        super().__init__(*args, **kwargs)

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        """Partition the sample's file and store the extracted text.

        Args:
            sample: Record read through ``self.filepath_key`` and
                ``self.filename_key`` and written through ``self.text_key``.

        Returns:
            The same ``sample`` object, with the extracted text stored under
            ``self.text_key``.

        Raises:
            UnicodeDecodeError: Logged together with the file name and then
                re-raised when raised while extracting the text.
        """
        # perf_counter() is monotonic, so the reported duration cannot be
        # skewed by wall-clock adjustments the way time.time() can.
        start = time.perf_counter()
        filepath = sample.get(self.filepath_key)
        filename = sample.get(self.filename_key)
        try:
            elements = partition(filename=filepath)
            # One blank line between elements keeps them visually separated.
            sample[self.text_key] = "\n\n".join(str(el) for el in elements)
        except UnicodeDecodeError as err:
            logger.exception(f"fileName: {filename}, method: UnstructuredFormatter causes decode error: {err}")
            raise
        # ".6f" states the intended precision explicitly ("6f" was a width).
        logger.info(f"fileName: {filename}, method: UnstructuredFormatter costs {(time.perf_counter() - start):.6f} s")
        return sample

runtime/ops/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,4 @@ xmltodict==1.0.2
1919
zhconv==1.4.3
2020
sqlalchemy==2.0.40
2121
pymysql==1.1.1
22+
unstructured[docx]==0.18.15

scripts/db/data-operator-init.sql

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ VALUES (1, '模态', 'predefined', 0),
6868
INSERT IGNORE INTO t_operator
6969
(id, name, description, version, inputs, outputs, runtime, settings, file_name, is_star)
7070
VALUES ('TextFormatter', 'TXT文本抽取', '抽取TXT中的文本。', '1.0.0', 'text', 'text', null, null, '', false),
71+
('UnstructuredFormatter', '非结构化文本抽取', '抽取非结构化文件的文本,目前支持word文档。', '1.0.0', 'text', 'text', null, null, '', false),
7172
('FileExporter', '落盘算子', '将文件保存到本地目录。', '1.0.0', 'all', 'all', null, null, '', false),
7273
('FileWithHighRepeatPhraseRateFilter', '文档词重复率检查', '去除重复词过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatPhraseRatio": {"name": "文档词重复率", "description": "某个词的统计数/文档总词数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}, "hitStopwords": {"name": "去除停用词", "description": "统计重复词时,选择是否要去除停用词。", "type": "switch", "defaultVal": false, "required": true, "checkedLabel": "去除", "unCheckedLabel": "不去除"}}', '', 'false'),
7374
('FileWithHighRepeatWordRateFilter', '文档字重复率检查', '去除重复字过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatWordRatio": {"name": "文档字重复率", "description": "某个字的统计数/文档总字数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}}', '', 'false'),
@@ -121,7 +122,7 @@ AND o.id IN ('TextFormatter', 'FileWithShortOrLongLengthFilter', 'FileWithHighRe
121122
'AnonymizedIpAddress', 'AnonymizedPhoneNumber', 'AnonymizedUrlCleaner', 'HtmlTagCleaner', 'XMLTagCleaner',
122123
'ContentCleaner', 'EmailNumberCleaner', 'EmojiCleaner', 'ExtraSpaceCleaner', 'FullWidthCharacterCleaner',
123124
'GrableCharactersCleaner', 'InvisibleCharactersCleaner', 'LegendCleaner', 'PoliticalWordCleaner',
124-
'SexualAndViolentWordCleaner', 'TraditionalChineseCleaner', 'UnicodeSpaceCleaner');
125+
'SexualAndViolentWordCleaner', 'TraditionalChineseCleaner', 'UnicodeSpaceCleaner', 'UnstructuredFormatter');
125126

126127
INSERT IGNORE INTO t_operator_category_relation(category_id, operator_id)
127128
SELECT c.id, o.id
@@ -137,4 +138,4 @@ SELECT c.id, o.id
137138
FROM t_operator_category c
138139
CROSS JOIN t_operator o
139140
WHERE c.id IN (7, 8, 11)
140-
AND o.id IN ('FileExporter');
141+
AND o.id IN ('FileExporter', 'UnstructuredFormatter');

scripts/images/runtime/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ ENV PYTHONPATH=/opt/runtime/datamate/
77

88
RUN sed -i 's/deb.debian.org/mirrors.huaweicloud.com/g' /etc/apt/sources.list.d/debian.sources \
99
&& apt update \
10-
&& apt install -y libgl1 libglib2.0-0 vim poppler-utils tesseract-ocr tesseract-ocr-chi-sim libmagic1t64 \
10+
&& apt install -y libgl1 libglib2.0-0 vim poppler-utils tesseract-ocr tesseract-ocr-chi-sim libmagic1t64 libreoffice\
1111
&& apt clean \
1212
&& rm -rf /var/lib/apt/lists/*
1313

0 commit comments

Comments
 (0)