diff --git a/backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/infrastructure/validator/CleanTaskValidator.java b/backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/infrastructure/validator/CleanTaskValidator.java index 92f3d29e9..8732a3e8a 100644 --- a/backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/infrastructure/validator/CleanTaskValidator.java +++ b/backend/services/data-cleaning-service/src/main/java/com/datamate/cleaning/infrastructure/validator/CleanTaskValidator.java @@ -17,24 +17,26 @@ public class CleanTaskValidator { private final CleaningTaskRepository cleaningTaskRepo; - public void checkNameDuplication (String name) { + public void checkNameDuplication(String name) { if (cleaningTaskRepo.isNameExist(name)) { throw BusinessException.of(CleanErrorCode.DUPLICATE_TASK_NAME); } } - public void checkInputAndOutput (List operators) { + public void checkInputAndOutput(List operators) { if (operators == null || operators.size() <= 1) { return; } for (int i = 1; i < operators.size(); i++) { OperatorInstanceDto front = operators.get(i - 1); OperatorInstanceDto back = operators.get(i); - if (!StringUtils.equals(front.getOutputs(), back.getInputs())) { - throw BusinessException.of(CleanErrorCode.IN_AND_OUT_NOT_MATCH, - String.format(Locale.ROOT, "ops(name: [%s, %s]) inputs and outputs does not match", - front.getName(), back.getName())); + if (StringUtils.equals(front.getOutputs(), back.getInputs()) || StringUtils.equalsAny("multimodal", + front.getOutputs(), back.getOutputs())) { + continue; } + throw BusinessException.of(CleanErrorCode.IN_AND_OUT_NOT_MATCH, + String.format(Locale.ROOT, "ops(name: [%s, %s]) inputs and outputs does not match", + front.getName(), back.getName())); } } } diff --git a/deployment/helm/datamate/charts/backend/templates/deployment.yaml b/deployment/helm/datamate/charts/backend/templates/deployment.yaml index 3d12f0524..cfed2e430 100644 --- 
a/deployment/helm/datamate/charts/backend/templates/deployment.yaml +++ b/deployment/helm/datamate/charts/backend/templates/deployment.yaml @@ -39,7 +39,7 @@ spec: {{- toYaml . | nindent 12 }} {{- end }} image: "{{ include "backend.image" . }}" - imagePullPolicy: {{ default .Values.global.image.pullPolicy .Values.image.pullPolicy }} + imagePullPolicy: {{ default .Values.image.pullPolicy .Values.global.image.pullPolicy }} ports: - name: http containerPort: {{ .Values.service.port }} diff --git a/deployment/helm/datamate/charts/database/templates/_helpers.tpl b/deployment/helm/datamate/charts/database/templates/_helpers.tpl index 24ff28228..5a91f49b3 100644 --- a/deployment/helm/datamate/charts/database/templates/_helpers.tpl +++ b/deployment/helm/datamate/charts/database/templates/_helpers.tpl @@ -67,9 +67,5 @@ Name of image {{- define "database.image" -}} {{- $name := default .Values.image.repository .Values.global.image.database.name }} {{- $tag := default .Values.image.tag .Values.global.image.database.tag }} -{{- if .Values.global.image.repository }} -{{- .Values.global.image.repository | trimSuffix "/" }}/{{ $name }}:{{ $tag }} -{{- else }} {{- $name }}:{{ $tag }} {{- end }} -{{- end }} diff --git a/deployment/helm/datamate/charts/database/templates/deployment.yaml b/deployment/helm/datamate/charts/database/templates/deployment.yaml index 92a150fbc..db4e0a5a4 100644 --- a/deployment/helm/datamate/charts/database/templates/deployment.yaml +++ b/deployment/helm/datamate/charts/database/templates/deployment.yaml @@ -61,7 +61,7 @@ spec: {{- toYaml . | nindent 12 }} {{- end }} image: "{{ include "database.image" . 
}}" - imagePullPolicy: {{ default .Values.global.image.pullPolicy .Values.image.pullPolicy }} + imagePullPolicy: {{ default .Values.image.pullPolicy .Values.global.image.pullPolicy }} ports: - name: http containerPort: {{ .Values.service.port }} diff --git a/deployment/helm/datamate/charts/frontend/templates/deployment.yaml b/deployment/helm/datamate/charts/frontend/templates/deployment.yaml index c8ddbc443..6ca81ab14 100644 --- a/deployment/helm/datamate/charts/frontend/templates/deployment.yaml +++ b/deployment/helm/datamate/charts/frontend/templates/deployment.yaml @@ -40,7 +40,7 @@ spec: {{- toYaml . | nindent 12 }} {{- end }} image: "{{ include "frontend.image" . }}" - imagePullPolicy: {{ default .Values.global.image.pullPolicy .Values.image.pullPolicy }} + imagePullPolicy: {{ default .Values.image.pullPolicy .Values.global.image.pullPolicy }} ports: - name: http containerPort: {{ .Values.service.port }} diff --git a/deployment/helm/datamate/values.yaml b/deployment/helm/datamate/values.yaml index 97aba1fdb..2ecd0ba74 100644 --- a/deployment/helm/datamate/values.yaml +++ b/deployment/helm/datamate/values.yaml @@ -3,6 +3,8 @@ # Declare variables to be passed into your templates. global: + deerFlow: + enable: false image: repository: "" pullPolicy: "IfNotPresent" diff --git a/deployment/helm/deer-flow/charts/backend/templates/deployment.yaml b/deployment/helm/deer-flow/charts/backend/templates/deployment.yaml index b7290fe35..c29331681 100644 --- a/deployment/helm/deer-flow/charts/backend/templates/deployment.yaml +++ b/deployment/helm/deer-flow/charts/backend/templates/deployment.yaml @@ -39,7 +39,7 @@ spec: {{- toYaml . | nindent 12 }} {{- end }} image: "{{ include "backend.image" . 
}}" - imagePullPolicy: {{ default .Values.global.image.pullPolicy .Values.image.pullPolicy }} + imagePullPolicy: {{ default .Values.image.pullPolicy .Values.global.image.pullPolicy }} ports: - name: http containerPort: {{ .Values.service.port }} diff --git a/deployment/helm/deer-flow/charts/frontend/templates/deployment.yaml b/deployment/helm/deer-flow/charts/frontend/templates/deployment.yaml index a5f2af15f..684cdaabf 100644 --- a/deployment/helm/deer-flow/charts/frontend/templates/deployment.yaml +++ b/deployment/helm/deer-flow/charts/frontend/templates/deployment.yaml @@ -39,7 +39,7 @@ spec: {{- toYaml . | nindent 12 }} {{- end }} image: "{{ include "frontend.image" . }}" - imagePullPolicy: {{ default .Values.global.image.pullPolicy .Values.image.pullPolicy }} + imagePullPolicy: {{ default .Values.image.pullPolicy .Values.global.image.pullPolicy }} ports: - name: http containerPort: {{ .Values.service.port }} diff --git a/runtime/ops/formatter/__init__.py b/runtime/ops/formatter/__init__.py index 814d224a8..301eab18d 100644 --- a/runtime/ops/formatter/__init__.py +++ b/runtime/ops/formatter/__init__.py @@ -21,7 +21,7 @@ def _import_operators(): from . import file_exporter from . import slide_formatter from . import unstructured_formatter - from . import external_pdf_formatter + from . 
import mineru_formatter _import_operators() diff --git a/runtime/ops/formatter/external_pdf_formatter/__init__.py b/runtime/ops/formatter/mineru_formatter/__init__.py similarity index 71% rename from runtime/ops/formatter/external_pdf_formatter/__init__.py rename to runtime/ops/formatter/mineru_formatter/__init__.py index dfbb651ec..698b60803 100644 --- a/runtime/ops/formatter/external_pdf_formatter/__init__.py +++ b/runtime/ops/formatter/mineru_formatter/__init__.py @@ -2,5 +2,5 @@ from datamate.core.base_op import OPERATORS -OPERATORS.register_module(module_name='ExternalPDFFormatter', - module_path="ops.formatter.external_pdf_formatter.process") +OPERATORS.register_module(module_name='MineruFormatter', + module_path="ops.formatter.mineru_formatter.process") diff --git a/runtime/ops/formatter/external_pdf_formatter/metadata.yml b/runtime/ops/formatter/mineru_formatter/metadata.yml similarity index 81% rename from runtime/ops/formatter/external_pdf_formatter/metadata.yml rename to runtime/ops/formatter/mineru_formatter/metadata.yml index d7a012656..ac3043bcf 100644 --- a/runtime/ops/formatter/external_pdf_formatter/metadata.yml +++ b/runtime/ops/formatter/mineru_formatter/metadata.yml @@ -1,10 +1,10 @@ name: 'MinerU PDF文本抽取' -name_en: 'External PDF Text Extraction' +name_en: 'MinerU PDF Text Extraction' description: '基于MinerU API,抽取PDF中的文本。' description_en: 'Extracts text from PDF files based on MinerU API.' 
language: 'python' vendor: 'huawei' -raw_id: 'ExternalPDFFormatter' +raw_id: 'MineruFormatter' version: '1.0.0' types: - 'collect' diff --git a/runtime/ops/formatter/external_pdf_formatter/process.py b/runtime/ops/formatter/mineru_formatter/process.py similarity index 77% rename from runtime/ops/formatter/external_pdf_formatter/process.py rename to runtime/ops/formatter/mineru_formatter/process.py index 1b91c48af..f1cff86c9 100644 --- a/runtime/ops/formatter/external_pdf_formatter/process.py +++ b/runtime/ops/formatter/mineru_formatter/process.py @@ -15,11 +15,11 @@ from datamate.common.utils.rest_client import http_request -class ExternalPDFFormatter(Mapper): +class MineruFormatter(Mapper): """基于外部API,抽取PDF中的文本""" def __init__(self, *args, **kwargs): - super(ExternalPDFFormatter, self).__init__(*args, **kwargs) + super(MineruFormatter, self).__init__(*args, **kwargs) self.base_url = os.getenv("EXTERNAL_PDF_BASE_URL", "http://datamate-mineru:9001") self.pdf_extract_url = f"{self.base_url}/api/pdf-extract" @@ -31,8 +31,8 @@ def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]: response = http_request(method="POST", url=self.pdf_extract_url, data=data) sample[self.text_key] = json.loads(response.text).get("result") logger.info( - f"fileName: {filename}, method: ExternalPDFFormatter costs {(time.time() - start):6f} s") + f"fileName: {filename}, method: MineruFormatter costs {(time.time() - start):6f} s") except UnicodeDecodeError as err: - logger.exception(f"fileName: {filename}, method: ExternalPDFFormatter causes decode error: {err}") + logger.exception(f"fileName: {filename}, method: MineruFormatter causes decode error: {err}") raise return sample diff --git a/runtime/ops/formatter/unstructured_formatter/metadata.yml b/runtime/ops/formatter/unstructured_formatter/metadata.yml index 66063c658..be68175fa 100644 --- a/runtime/ops/formatter/unstructured_formatter/metadata.yml +++ b/runtime/ops/formatter/unstructured_formatter/metadata.yml @@ -1,4 +1,4 @@ -name: 
'非结构化文本抽取' +name: 'Unstructured文本抽取' name_en: 'Unstructured Text Extraction' description: '抽取非结构化文件的文本,目前支持PowerPoint演示文稿、Word文档以及Excel工作簿。' description_en: 'Extracts text from Unstructured files, currently supporting PowerPoint presentations, Word documents and Excel spreadsheets files.' diff --git a/runtime/python-executor/datamate/common/utils/lazy_loader.py b/runtime/python-executor/datamate/common/utils/lazy_loader.py index 5058f115d..f26a2f27d 100644 --- a/runtime/python-executor/datamate/common/utils/lazy_loader.py +++ b/runtime/python-executor/datamate/common/utils/lazy_loader.py @@ -51,7 +51,7 @@ class LazyLoader(ModuleType): def __init__(self, package_name, module_name=None, - whl_path="/dataset/ops_whl", + whl_path=None, exact_version=None, force_reinstall=False ): @@ -72,7 +72,7 @@ def __init__(self, self._module_name = module_name if module_name else package_name self._package_name = package_name - self.whl_path = Path(whl_path).resolve() + self.whl_path = whl_path self.exact_version = exact_version self.force_reinstall = force_reinstall @@ -126,7 +126,10 @@ def _load_module(self): need_install = True if need_install: - self._pip_install_package(package_name) + if self.whl_path is None: + self._pip_install_package_pypi(package_name) + else: + self._pip_install_package_local(package_name) module = importlib.import_module(module_name) self._cached_module = module self._register_alias(module) @@ -168,13 +171,26 @@ def _get_installed_version(self, package_name): return line.split()[-1] raise PackageNotFoundError() - def _pip_install_package(self, package_name: str): + def _pip_install_package_pypi(self, package_name: str): + if self.exact_version: + package_name += f"=={self.exact_version}" + try: + subprocess.check_call([ + sys.executable, "-m", "pip", "install", str(package_name) + ], stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT) + logger.info(f"Successfully installed {package_name}") + except subprocess.CalledProcessError as e: + 
logger.error(f"Installation failed: {e}") + raise RuntimeError(f"Installation failed: {e}") from e + + def _pip_install_package_local(self, package_name: str): """安装逻辑 """ - if not self.whl_path.exists(): + whl_path = Path(self.whl_path).resolve() + if not whl_path.exists(): raise FileNotFoundError(f"WHL directory not found: {self.whl_path}") - whl_files = list(self.whl_path.glob(f"{package_name}*.whl")) + whl_files = list(whl_path.glob(f"{package_name}*.whl")) if not whl_files: raise RuntimeError(f"No WHL files found for {package_name}") diff --git a/scripts/db/data-operator-init.sql b/scripts/db/data-operator-init.sql index af061a459..a951d43fd 100644 --- a/scripts/db/data-operator-init.sql +++ b/scripts/db/data-operator-init.sql @@ -70,8 +70,8 @@ VALUES ('64465bec-b46b-11f0-8291-00155d0e4808', '模态', 'modal', 'predefined' INSERT IGNORE INTO t_operator (id, name, description, version, inputs, outputs, runtime, settings, file_name, is_star) VALUES ('TextFormatter', 'TXT文本抽取', '抽取TXT中的文本。', '1.0.0', 'text', 'text', null, null, '', false), - ('UnstructuredFormatter', '非结构化文本抽取', '抽取非结构化文件的文本,目前支持PowerPoint演示文稿、Word文档以及Excel工作簿。', '1.0.0', 'text', 'text', null, null, '', false), - ('ExternalPDFFormatter', 'MinerU PDF文本抽取', '基于MinerU API,抽取PDF中的文本。', '1.0.0', 'text', 'text', null, null, '', false), + ('UnstructuredFormatter', 'Unstructured文本抽取', '基于Unstructured抽取非结构化文件的文本,目前支持PowerPoint演示文稿、Word文档以及Excel工作簿。', '1.0.0', 'text', 'text', null, null, '', false), + ('MineruFormatter', 'MinerU PDF文本抽取', '基于MinerU API,抽取PDF中的文本。', '1.0.0', 'text', 'text', null, null, '', false), ('FileExporter', '落盘算子', '将文件保存到本地目录。', '1.0.0', 'all', 'all', null, null, '', false), ('FileWithHighRepeatPhraseRateFilter', '文档词重复率检查', '去除重复词过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatPhraseRatio": {"name": "文档词重复率", "description": "某个词的统计数/文档总词数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}, "hitStopwords": {"name": "去除停用词", "description": 
"统计重复词时,选择是否要去除停用词。", "type": "switch", "defaultVal": false, "required": true, "checkedLabel": "去除", "unCheckedLabel": "不去除"}}', '', 'false'), ('FileWithHighRepeatWordRateFilter', '文档字重复率检查', '去除重复字过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatWordRatio": {"name": "文档字重复率", "description": "某个字的统计数/文档总字数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}}', '', 'false'), diff --git a/scripts/images/runtime/Dockerfile b/scripts/images/runtime/Dockerfile index c0a2f9ab1..bff8c4e26 100644 --- a/scripts/images/runtime/Dockerfile +++ b/scripts/images/runtime/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.11 +FROM ghcr.io/astral-sh/uv:python3.11-bookworm COPY runtime/python-executor /opt/runtime COPY runtime/ops /opt/runtime/datamate/ops @@ -7,16 +7,16 @@ COPY scripts/images/runtime/start.sh /opt/runtime/start.sh ENV PYTHONPATH=/opt/runtime/datamate/ -RUN apt update \ - && apt install -y libgl1 libglib2.0-0 vim libmagic1t64 libreoffice dos2unix \ - && apt clean \ - && rm -rf /var/lib/apt/lists/* +RUN --mount=type=cache,target=/var/cache/apt \ + --mount=type=cache,target=/var/lib/apt \ + apt update \ + && apt install -y libgl1 libglib2.0-0 vim libmagic1 libreoffice dos2unix WORKDIR /opt/runtime -RUN pip install -e . --trusted-host mirrors.huaweicloud.com -i https://mirrors.huaweicloud.com/repository/pypi/simple \ - && pip install -r /opt/runtime/datamate/ops/requirements.txt --trusted-host mirrors.huaweicloud.com -i https://mirrors.huaweicloud.com/repository/pypi/simple \ - && pip cache purge +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install -e . --system \ + && uv pip install -r /opt/runtime/datamate/ops/requirements.txt --system RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime \ && chmod +x /opt/runtime/start.sh \