feature: add unstructured formatter operator for doc/docx (#17)

Startalker · syc366 · web-flow · commit f86d4fae250f · 2025-10-23T16:49:03.000+08:00
* feature: add UnstructuredFormatter

* feature: add UnstructuredFormatter in db

* feature: add unstructured[docx]==0.18.15

* feature: support doc

---------

Co-authored-by: Startalker &lt;438747480@qq.com&gt;
diff --git a/runtime/ops/formatter/__init__.py b/runtime/ops/formatter/__init__.py
@@ -20,6 +20,7 @@ def _import_operators():
     from . import img_formatter
     from . import file_exporter
     from . import slide_formatter
+    from . import unstructured_formatter
 
 
 _import_operators()
diff --git a/runtime/ops/formatter/unstructured_formatter/__init__.py b/runtime/ops/formatter/unstructured_formatter/__init__.py
@@ -0,0 +1,6 @@
+# -*- coding: utf-8 -*-
+
+from datamate.core.base_op import OPERATORS
+
+OPERATORS.register_module(module_name='UnstructuredFormatter',  # same name as the class in process.py and raw_id in metadata.yml
+                          module_path="ops.formatter.unstructured_formatter.process")  # dotted path of the module that defines the class
diff --git a/runtime/ops/formatter/unstructured_formatter/metadata.yml b/runtime/ops/formatter/unstructured_formatter/metadata.yml
@@ -0,0 +1,16 @@
+name: '非结构化文本抽取'
+name_en: 'Unstructured Text Extraction'
+description: '抽取非结构化文件的文本，目前支持word文档'
+description_en: 'Extracts text from unstructured files, currently supporting Word documents.'
+language: 'python'
+vendor: 'huawei'
+raw_id: 'UnstructuredFormatter'
+version: '1.0.0'
+types:
+  - 'collect'
+modal: 'text'
+effect:
+  before: ''
+  after: ''
+inputs: 'text'
+outputs: 'text'
diff --git a/runtime/ops/formatter/unstructured_formatter/process.py b/runtime/ops/formatter/unstructured_formatter/process.py
@@ -0,0 +1,35 @@
+
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+"""
+Description: 非结构化文本抽取
+Create: 2025/10/22 15:15
+"""
+import time
+from typing import Dict, Any
+
+from loguru import logger
+from unstructured.partition.auto import partition
+
+from datamate.core.base_op import Mapper
+
+
+class UnstructuredFormatter(Mapper):
+    """Extract the input unstructured text as txt."""
+
+    def __init__(self, *args, **kwargs):
+        super(UnstructuredFormatter, self).__init__(*args, **kwargs)
+
+    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:  # fills sample[self.text_key] from the file at sample[self.filepath_key]; returns the same dict
+        start = time.time()  # start timestamp, used only for the elapsed-time log below
+        filepath = sample.get(self.filepath_key)  # NOTE(review): None when the key is absent; partition() then receives filename=None
+        filename = sample.get(self.filename_key)  # used only in log messages
+        try:
+            elements = partition(filename=filepath)  # unstructured's auto partitioner; the result is iterated as a sequence of elements
+            sample[self.text_key] = "\n\n".join([str(el) for el in elements])  # str() of each element, separated by a blank line
+            logger.info(f"fileName: {filename}, method: UnstructuredFormatter costs {(time.time() - start):6f} s")  # NOTE(review): ':6f' is a width, not a precision; '.6f' was probably meant (output is the same)
+        except UnicodeDecodeError as err:  # only decode errors are logged here; any other exception propagates without a log entry from this method
+            logger.exception(f"fileName: {filename}, method: UnstructuredFormatter causes decode error: {err}")
+            raise  # re-raise after logging so the caller still sees the failure
+        return sample
diff --git a/runtime/ops/requirements.txt b/runtime/ops/requirements.txt
@@ -19,3 +19,4 @@ xmltodict==1.0.2
 zhconv==1.4.3
 sqlalchemy==2.0.40
 pymysql==1.1.1
+unstructured[docx]==0.18.15
diff --git a/scripts/db/data-operator-init.sql b/scripts/db/data-operator-init.sql
@@ -68,6 +68,7 @@ VALUES (1, '模态', 'predefined', 0),
 INSERT IGNORE INTO t_operator
 (id, name, description, version, inputs, outputs, runtime, settings, file_name, is_star)
 VALUES ('TextFormatter', 'TXT文本抽取', '抽取TXT中的文本。', '1.0.0', 'text', 'text', null, null, '', false),
+       ('UnstructuredFormatter', '非结构化文本抽取', '抽取非结构化文件的文本，目前支持word文档。', '1.0.0', 'text', 'text', null, null, '', false),
        ('FileExporter', '落盘算子', '将文件保存到本地目录。', '1.0.0', 'all', 'all', null, null, '', false),
        ('FileWithHighRepeatPhraseRateFilter', '文档词重复率检查', '去除重复词过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatPhraseRatio": {"name": "文档词重复率", "description": "某个词的统计数/文档总词数 > 设定值，该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}, "hitStopwords": {"name": "去除停用词", "description": "统计重复词时，选择是否要去除停用词。", "type": "switch", "defaultVal": false, "required": true, "checkedLabel": "去除", "unCheckedLabel": "不去除"}}', '', 'false'),
        ('FileWithHighRepeatWordRateFilter', '文档字重复率检查', '去除重复字过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatWordRatio": {"name": "文档字重复率", "description": "某个字的统计数/文档总字数 > 设定值，该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}}', '', 'false'),
@@ -121,7 +122,7 @@ AND o.id IN ('TextFormatter', 'FileWithShortOrLongLengthFilter', 'FileWithHighRe
             'AnonymizedIpAddress', 'AnonymizedPhoneNumber', 'AnonymizedUrlCleaner', 'HtmlTagCleaner', 'XMLTagCleaner',
             'ContentCleaner', 'EmailNumberCleaner', 'EmojiCleaner', 'ExtraSpaceCleaner', 'FullWidthCharacterCleaner',
             'GrableCharactersCleaner', 'InvisibleCharactersCleaner', 'LegendCleaner', 'PoliticalWordCleaner',
-            'SexualAndViolentWordCleaner', 'TraditionalChineseCleaner', 'UnicodeSpaceCleaner');
+            'SexualAndViolentWordCleaner', 'TraditionalChineseCleaner', 'UnicodeSpaceCleaner', 'UnstructuredFormatter');
 
 INSERT IGNORE INTO t_operator_category_relation(category_id, operator_id)
 SELECT c.id, o.id
@@ -137,4 +138,4 @@ SELECT c.id, o.id
 FROM t_operator_category c
        CROSS JOIN t_operator o
 WHERE c.id IN (7, 8, 11)
-  AND o.id IN ('FileExporter');
+  AND o.id IN ('FileExporter', 'UnstructuredFormatter');
diff --git a/scripts/images/runtime/Dockerfile b/scripts/images/runtime/Dockerfile
@@ -7,7 +7,7 @@ ENV PYTHONPATH=/opt/runtime/datamate/
 
 RUN sed -i 's/deb.debian.org/mirrors.huaweicloud.com/g' /etc/apt/sources.list.d/debian.sources \
     && apt update \
-    && apt install -y libgl1 libglib2.0-0 vim poppler-utils tesseract-ocr tesseract-ocr-chi-sim libmagic1t64 \
+    && apt install -y libgl1 libglib2.0-0 vim poppler-utils tesseract-ocr tesseract-ocr-chi-sim libmagic1t64 libreoffice \
     && apt clean \
     && rm -rf /var/lib/apt/lists/*