|
7 | 7 | 每个自定义算子都需要包含一个 `metadata.yml` 文件: |
8 | 8 |
|
9 | 9 | ```yaml |
10 | | -name: '落盘算子' |
11 | | -name_en: 'save file operator' |
12 | | -description: '将文件内容保存为文件。' |
13 | | -description_en: 'Save the file data as a file.' |
14 | | -language: 'Python' |
15 | | -vendor: 'Huawei' |
16 | | -raw_id: 'FileExporter' |
| 10 | +name: '测试算子' |
| 11 | +description: '这是一个测试算子。' |
| 12 | +language: 'python' |
| 13 | +vendor: 'huawei' |
| 14 | +raw_id: 'TestMapper' |
17 | 15 | version: '1.0.0' |
18 | | -types: |
19 | | - - 'collect' |
20 | | -modal: 'others' |
21 | | -effect: |
22 | | - before: '' |
23 | | - after: '' |
24 | | -inputs: 'all' |
25 | | -outputs: 'all' |
| 16 | +modal: 'text' |
| 17 | +inputs: 'text' |
| 18 | +outputs: 'text' |
26 | 19 | ``` |
27 | 20 |
|
28 | 21 | ### 算子实现 |
29 | 22 |
|
30 | | -创建 `process.py` 文件: |
| 23 | +#### process.py |
31 | 24 |
|
32 | 25 | ```python |
33 | 26 | # -*- coding: utf-8 -*- |
34 | 27 |
|
35 | | -""" |
36 | | -Description: Json文本抽取 |
37 | | -Create: 2024/06/06 15:43 |
38 | | -""" |
39 | | -import time |
40 | | -from loguru import logger |
41 | | -from typing import Dict, Any |
42 | | -
|
| 28 | +# 导入所需数据结构,可以通过以下方式直接导入使用 |
| 29 | +# 提供两种算子类: |
| 30 | +# Mapper用于映射和转换数据,使用时直接修改数据内容 |
43 | 31 | from datamate.core.base_op import Mapper |
44 | 32 |
|
| 33 | +class TestMapper(Mapper): |
| 34 | + def execute(self, sample): |
| 35 | + sample[self.text_key] += "\n新增的数据" |
| 36 | + return sample |
| 37 | + |
| 38 | + |
| 39 | +# Filter用于过滤和选择性保留数据,使用时将需要过滤的数据的text或data置为空值 |
| 40 | +from datamate.core.base_op import Filter |
45 | 41 |
|
46 | | -class TextFormatter(Mapper): |
47 | | - """把输入的json文件流抽取为txt""" |
48 | | -
|
49 | | - def __init__(self, *args, **kwargs): |
50 | | - super(TextFormatter, self).__init__(*args, **kwargs) |
51 | | -
|
52 | | - @staticmethod |
53 | | - def _extract_json(byte_io): |
54 | | - """将默认使用utf-8编码的Json文件流解码,抽取为txt""" |
55 | | - # 用utf-8-sig的格式进行抽取,可以避免uft-8 BOM编码格式的文件在抽取后产生隐藏字符作为前缀。 |
56 | | - return byte_io.decode("utf-8-sig").replace("\r\n", "\n") |
57 | | -
|
58 | | - def byte_read(self, sample: Dict[str, Any]): |
59 | | - filepath = sample[self.filepath_key] |
60 | | - with open(filepath, "rb") as file: |
61 | | - byte_data = file.read() |
62 | | - sample[self.data_key] = byte_data |
63 | | -
|
64 | | - def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]: |
65 | | - start = time.time() |
66 | | - try: |
67 | | - self.byte_read(sample) |
68 | | - sample[self.text_key] = self._extract_json(sample[self.data_key]) |
69 | | - sample[self.data_key] = b"" # 将sample[self.data_key]置空 |
70 | | - logger.info( |
71 | | - f"fileName: {sample[self.filename_key]}, method: TextFormatter costs {(time.time() - start):6f} s") |
72 | | - except UnicodeDecodeError as err: |
73 | | - logger.exception(f"fileName: {sample[self.filename_key]}, method: TextFormatter causes decode error: {err}") |
74 | | - raise |
| 42 | +class TestFilter(Filter): |
| 43 | + def execute(self, sample): |
| 44 | + if len(sample[self.text_key]) > 100: |
| 45 | + sample[self.text_key] = "" |
75 | 46 | return sample |
76 | 47 |
|
77 | 48 | ``` |
78 | 49 |
|
79 | | -创建 `__init__.py` 文件: |
| 50 | +其中,sample的数据结构如下所示: |
| 51 | +```jsonc |
| 52 | +// 数据结构 |
| 53 | +{ |
| 54 | + "text": "数据文件的文本内容", |
| 55 | + "data": "多模态文件的内容", |
| 56 | + "fileName": "文件名称", |
| 57 | + "fileType": "文件类型", |
| 58 | + "filePath": "文件路径", |
| 59 | + "fileSize": "文件大小", |
| 60 | + "export_path": "保存的文件路径", |
| 61 | + "extraFileType": "导出的文件类型" |
| 62 | +} |
| 63 | + |
| 64 | +// 数据示例 |
| 65 | +{ |
| 66 | + "text": "text", |
| 67 | + "data": "data", |
| 68 | + "fileName": "test", |
| 69 | + "fileType": "pdf", |
| 70 | + "filePath": "/dataset/test.pdf", |
| 71 | + "fileSize": "100B", |
| 72 | + "export_path": "/dataset/test.txt", |
| 73 | + "extraFileType": "txt" |
| 74 | +} |
| 75 | +``` |
| 76 | + |
| 77 | +#### \_\_init__.py |
80 | 78 |
|
81 | 79 | ```python |
82 | 80 | # -*- coding: utf-8 -*- |
83 | 81 |
|
| 82 | +# 导入OPERATORS用于进行模块注册,可以通过以下方式直接导入使用 |
84 | 83 | from datamate.core.base_op import OPERATORS |
85 | 84 |
|
86 | | -OPERATORS.register_module(module_name='TextFormatter', |
87 | | - module_path="ops.formatter.text_formatter.process") |
| 85 | +# module_name必须与算子类名称一致;module_path中的压缩包名须替换为实际的算子压缩包名称,格式为:ops.user.压缩包名.process |
| 86 | +OPERATORS.register_module(module_name='TestMapper', |
| 87 | + module_path="ops.user.test_operator.process") |
88 | 88 |
|
89 | 89 | ``` |
0 commit comments