Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -17,24 +17,26 @@
public class CleanTaskValidator {
private final CleaningTaskRepository cleaningTaskRepo;

public void checkNameDuplication (String name) {
public void checkNameDuplication(String name) {
if (cleaningTaskRepo.isNameExist(name)) {
throw BusinessException.of(CleanErrorCode.DUPLICATE_TASK_NAME);
}
}

public void checkInputAndOutput (List<OperatorInstanceDto> operators) {
public void checkInputAndOutput(List<OperatorInstanceDto> operators) {
if (operators == null || operators.size() <= 1) {
return;
}
for (int i = 1; i < operators.size(); i++) {
OperatorInstanceDto front = operators.get(i - 1);
OperatorInstanceDto back = operators.get(i);
if (!StringUtils.equals(front.getOutputs(), back.getInputs())) {
throw BusinessException.of(CleanErrorCode.IN_AND_OUT_NOT_MATCH,
String.format(Locale.ROOT, "ops(name: [%s, %s]) inputs and outputs does not match",
front.getName(), back.getName()));
if (StringUtils.equals(front.getOutputs(), back.getInputs()) || StringUtils.equalsAny("multimodal",
front.getOutputs(), back.getOutputs())) {
continue;
}
throw BusinessException.of(CleanErrorCode.IN_AND_OUT_NOT_MATCH,
String.format(Locale.ROOT, "ops(name: [%s, %s]) inputs and outputs does not match",
front.getName(), back.getName()));
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ spec:
{{- toYaml . | nindent 12 }}
{{- end }}
image: "{{ include "backend.image" . }}"
imagePullPolicy: {{ default .Values.global.image.pullPolicy .Values.image.pullPolicy }}
imagePullPolicy: {{ default .Values.image.pullPolicy .Values.global.image.pullPolicy }}
ports:
- name: http
containerPort: {{ .Values.service.port }}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,9 +67,5 @@ Name of image
{{- define "database.image" -}}
{{- $name := default .Values.image.repository .Values.global.image.database.name }}
{{- $tag := default .Values.image.tag .Values.global.image.database.tag }}
{{- if .Values.global.image.repository }}
{{- .Values.global.image.repository | trimSuffix "/" }}/{{ $name }}:{{ $tag }}
{{- else }}
{{- $name }}:{{ $tag }}
{{- end }}
{{- end }}
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ spec:
{{- toYaml . | nindent 12 }}
{{- end }}
image: "{{ include "database.image" . }}"
imagePullPolicy: {{ default .Values.global.image.pullPolicy .Values.image.pullPolicy }}
imagePullPolicy: {{ default .Values.image.pullPolicy .Values.global.image.pullPolicy }}
ports:
- name: http
containerPort: {{ .Values.service.port }}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ spec:
{{- toYaml . | nindent 12 }}
{{- end }}
image: "{{ include "frontend.image" . }}"
imagePullPolicy: {{ default .Values.global.image.pullPolicy .Values.image.pullPolicy }}
imagePullPolicy: {{ default .Values.image.pullPolicy .Values.global.image.pullPolicy }}
ports:
- name: http
containerPort: {{ .Values.service.port }}
Expand Down
2 changes: 2 additions & 0 deletions deployment/helm/datamate/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
# Declare variables to be passed into your templates.

global:
deerFlow:
enable: false
image:
repository: ""
pullPolicy: "IfNotPresent"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ spec:
{{- toYaml . | nindent 12 }}
{{- end }}
image: "{{ include "backend.image" . }}"
imagePullPolicy: {{ default .Values.global.image.pullPolicy .Values.image.pullPolicy }}
imagePullPolicy: {{ default .Values.image.pullPolicy .Values.global.image.pullPolicy }}
ports:
- name: http
containerPort: {{ .Values.service.port }}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ spec:
{{- toYaml . | nindent 12 }}
{{- end }}
image: "{{ include "frontend.image" . }}"
imagePullPolicy: {{ default .Values.global.image.pullPolicy .Values.image.pullPolicy }}
imagePullPolicy: {{ default .Values.image.pullPolicy .Values.global.image.pullPolicy }}
ports:
- name: http
containerPort: {{ .Values.service.port }}
Expand Down
2 changes: 1 addition & 1 deletion runtime/ops/formatter/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def _import_operators():
from . import file_exporter
from . import slide_formatter
from . import unstructured_formatter
from . import external_pdf_formatter
from . import mineru_formatter


_import_operators()
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@

from datamate.core.base_op import OPERATORS

OPERATORS.register_module(module_name='ExternalPDFFormatter',
OPERATORS.register_module(module_name='MineruFormatter',
module_path="ops.formatter.external_pdf_formatter.process")
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
name: 'MinerU PDF文本抽取'
name_en: 'External PDF Text Extraction'
name_en: 'MinerU PDF Text Extraction'
description: '基于MinerU API,抽取PDF中的文本。'
description_en: 'Extracts text from PDF files based on MinerU API.'
language: 'python'
vendor: 'huawei'
raw_id: 'ExternalPDFFormatter'
raw_id: 'MineruFormatter'
version: '1.0.0'
types:
- 'collect'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,11 @@
from datamate.common.utils.rest_client import http_request


class ExternalPDFFormatter(Mapper):
class MineruFormatter(Mapper):
"""基于外部API,抽取PDF中的文本"""

def __init__(self, *args, **kwargs):
super(ExternalPDFFormatter, self).__init__(*args, **kwargs)
super(MineruFormatter, self).__init__(*args, **kwargs)
self.base_url = os.getenv("EXTERNAL_PDF_BASE_URL", "http://datamate-mineru:9001")
self.pdf_extract_url = f"{self.base_url}/api/pdf-extract"

Expand All @@ -31,8 +31,8 @@ def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
response = http_request(method="POST", url=self.pdf_extract_url, data=data)
sample[self.text_key] = json.loads(response.text).get("result")
logger.info(
f"fileName: {filename}, method: ExternalPDFFormatter costs {(time.time() - start):6f} s")
f"fileName: {filename}, method: MineruFormatter costs {(time.time() - start):6f} s")
except UnicodeDecodeError as err:
logger.exception(f"fileName: {filename}, method: ExternalPDFFormatter causes decode error: {err}")
logger.exception(f"fileName: {filename}, method: MineruFormatter causes decode error: {err}")
raise
return sample
2 changes: 1 addition & 1 deletion runtime/ops/formatter/unstructured_formatter/metadata.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: '非结构化文本抽取'
name: 'Unstructured文本抽取'
name_en: 'Unstructured Text Extraction'
description: '抽取非结构化文件的文本,目前支持PowerPoint演示文稿、Word文档以及Excel工作簿。'
description_en: 'Extracts text from Unstructured files, currently supporting PowerPoint presentations, Word documents and Excel spreadsheets files.'
Expand Down
28 changes: 22 additions & 6 deletions runtime/python-executor/datamate/common/utils/lazy_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ class LazyLoader(ModuleType):
def __init__(self,
package_name,
module_name=None,
whl_path="/dataset/ops_whl",
whl_path=None,
exact_version=None,
force_reinstall=False
):
Expand All @@ -72,7 +72,7 @@ def __init__(self,
self._module_name = module_name if module_name else package_name
self._package_name = package_name

self.whl_path = Path(whl_path).resolve()
self.whl_path = whl_path
self.exact_version = exact_version

self.force_reinstall = force_reinstall
Expand Down Expand Up @@ -126,7 +126,10 @@ def _load_module(self):
need_install = True

if need_install:
self._pip_install_package(package_name)
if self.whl_path is None:
self._pip_install_package_pypi(package_name)
else:
self._pip_install_package_local(package_name)
module = importlib.import_module(module_name)
self._cached_module = module
self._register_alias(module)
Expand Down Expand Up @@ -168,13 +171,26 @@ def _get_installed_version(self, package_name):
return line.split()[-1]
raise PackageNotFoundError()

def _pip_install_package(self, package_name: str):
def _pip_install_package_pypi(self, package_name: str):
if self.exact_version:
package_name += f"=={self.exact_version}"
try:
subprocess.check_call([
sys.executable, "-m", "pip", "install", str(package_name)
], stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
logger.info(f"Successfully installed {package_name}")
except subprocess.CalledProcessError as e:
logger.error(f"Installation failed: {e}")
raise RuntimeError(f"Installation failed: {e}") from e

def _pip_install_package_local(self, package_name: str):
"""安装逻辑 """

if not self.whl_path.exists():
whl_path = Path(self.whl_path).resolve()
if not whl_path.exists():
raise FileNotFoundError(f"WHL directory not found: {self.whl_path}")

whl_files = list(self.whl_path.glob(f"{package_name}*.whl"))
whl_files = list(whl_path.glob(f"{package_name}*.whl"))
if not whl_files:
raise RuntimeError(f"No WHL files found for {package_name}")

Expand Down
4 changes: 2 additions & 2 deletions scripts/db/data-operator-init.sql
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,8 @@ VALUES ('64465bec-b46b-11f0-8291-00155d0e4808', '模态', 'modal', 'predefined'
INSERT IGNORE INTO t_operator
(id, name, description, version, inputs, outputs, runtime, settings, file_name, is_star)
VALUES ('TextFormatter', 'TXT文本抽取', '抽取TXT中的文本。', '1.0.0', 'text', 'text', null, null, '', false),
('UnstructuredFormatter', '非结构化文本抽取', '抽取非结构化文件的文本,目前支持PowerPoint演示文稿、Word文档以及Excel工作簿。', '1.0.0', 'text', 'text', null, null, '', false),
('ExternalPDFFormatter', 'MinerU PDF文本抽取', '基于MinerU API,抽取PDF中的文本。', '1.0.0', 'text', 'text', null, null, '', false),
('UnstructuredFormatter', 'Unstructured文本抽取', '基于Unstructured抽取非结构化文件的文本,目前支持PowerPoint演示文稿、Word文档以及Excel工作簿。', '1.0.0', 'text', 'text', null, null, '', false),
('MineruFormatter', 'MinerU PDF文本抽取', '基于MinerU API,抽取PDF中的文本。', '1.0.0', 'text', 'text', null, null, '', false),
('FileExporter', '落盘算子', '将文件保存到本地目录。', '1.0.0', 'all', 'all', null, null, '', false),
('FileWithHighRepeatPhraseRateFilter', '文档词重复率检查', '去除重复词过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatPhraseRatio": {"name": "文档词重复率", "description": "某个词的统计数/文档总词数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}, "hitStopwords": {"name": "去除停用词", "description": "统计重复词时,选择是否要去除停用词。", "type": "switch", "defaultVal": false, "required": true, "checkedLabel": "去除", "unCheckedLabel": "不去除"}}', '', 'false'),
('FileWithHighRepeatWordRateFilter', '文档字重复率检查', '去除重复字过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatWordRatio": {"name": "文档字重复率", "description": "某个字的统计数/文档总字数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}}', '', 'false'),
Expand Down
16 changes: 8 additions & 8 deletions scripts/images/runtime/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM python:3.11
FROM ghcr.io/astral-sh/uv:python3.11-bookworm

COPY runtime/python-executor /opt/runtime
COPY runtime/ops /opt/runtime/datamate/ops
Expand All @@ -7,16 +7,16 @@ COPY scripts/images/runtime/start.sh /opt/runtime/start.sh

ENV PYTHONPATH=/opt/runtime/datamate/

RUN apt update \
&& apt install -y libgl1 libglib2.0-0 vim libmagic1t64 libreoffice dos2unix \
&& apt clean \
&& rm -rf /var/lib/apt/lists/*
RUN --mount=type=cache,target=/var/cache/apt \
--mount=type=cache,target=/var/lib/apt \
apt update \
&& apt install -y libgl1 libglib2.0-0 vim libmagic1 libreoffice dos2unix

WORKDIR /opt/runtime

RUN pip install -e . --trusted-host mirrors.huaweicloud.com -i https://mirrors.huaweicloud.com/repository/pypi/simple \
&& pip install -r /opt/runtime/datamate/ops/requirements.txt --trusted-host mirrors.huaweicloud.com -i https://mirrors.huaweicloud.com/repository/pypi/simple \
&& pip cache purge
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -e . --system \
&& uv pip install -r /opt/runtime/datamate/ops/requirements.txt --system

RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime \
&& chmod +x /opt/runtime/start.sh \
Expand Down