Skip to content

Commit d59c167

Browse files
authored
算子将抽取与落盘固定到流程中 (#134)
* feature: 将抽取动作移到每一个算子中
* feature: 落盘算子改为默认执行
* feature: 优化前端展示
* feature: 使用pyproject管理依赖
1 parent 744d15b commit d59c167

File tree

70 files changed

+289
-539
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

70 files changed

+289
-539
lines changed

frontend/src/pages/DataCleansing/Create/components/OperatorOrchestration.tsx

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,7 @@ const OperatorFlow: React.FC<OperatorFlowProps> = ({
150150
max={selectedOperators.length}
151151
defaultValue={index + 1}
152152
className="w-10 h-6 text-xs text-center"
153+
style={{ width: 60 }}
153154
autoFocus
154155
onBlur={(e) => handleIndexChange(operator.id, e.target.value)}
155156
onKeyDown={(e) => {

frontend/src/pages/DataCleansing/Detail/components/FileTable.tsx

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -227,19 +227,16 @@ export default function FileTable({result, fetchTaskResult}) {
227227
dataIndex: "status",
228228
key: "status",
229229
filters: [
230-
{ text: "已完成", value: "已完成" },
231-
{ text: "失败", value: "失败" },
232-
{ text: "处理中", value: "处理中" },
230+
{ text: "已完成", value: "COMPLETED" },
231+
{ text: "失败", value: "FAILED" },
233232
],
234233
onFilter: (value: string, record: any) => record.status === value,
235234
render: (status: string) => (
236235
<Badge
237236
status={
238237
status === "COMPLETED"
239238
? "success"
240-
: status === "FAILED"
241-
? "error"
242-
: "processing"
239+
: "error"
243240
}
244241
text={TaskStatusMap[status as TaskStatus].label}
245242
/>
@@ -248,6 +245,7 @@ export default function FileTable({result, fetchTaskResult}) {
248245
{
249246
title: "操作",
250247
key: "action",
248+
width: 200,
251249
render: (_text: string, record: any) => (
252250
<div className="flex">
253251
{record.status === "COMPLETED" ? (

runtime/ops/filter/file_with_high_repeat_phrase_rate_filter/process.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ def __init__(self, *args, **kwargs):
3333

3434
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
3535
start = time.time()
36+
self.read_file_first(sample)
3637
sample[self.text_key] = self._file_with_high_repeat_phrase_rate_filter(sample[self.text_key],
3738
sample[self.filename_key])
3839
logger.info(f"fileName: {sample[self.filename_key]}, "

runtime/ops/filter/file_with_high_repeat_word_rate_filter/process.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ def _extract_word(input_data):
3030

3131
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
3232
start = time.time()
33+
self.read_file_first(sample)
3334
sample[self.text_key] = self._file_with_high_repeat_word_rate_filter(sample[self.text_key],
3435
sample[self.filename_key])
3536
logger.info(f"fileName: {sample[self.filename_key]}, "

runtime/ops/filter/file_with_high_special_char_rate_filter/process.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ def __init__(self, *args, **kwargs):
2626

2727
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
2828
start = time.time()
29+
self.read_file_first(sample)
2930
sample[self.text_key] = self._file_with_high_special_char_rate_filter(sample[self.text_key],
3031
sample[self.filename_key])
3132
logger.info(f"fileName: {sample[self.filename_key]}, "

runtime/ops/filter/img_advertisement_images_cleaner/process.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ def resize_img(self, image):
105105

106106
def execute(self, sample: Dict[str, Any]):
107107
start = time.time()
108+
self.read_file_first(sample)
108109
file_name = sample[self.filename_key]
109110
file_type = "." + sample[self.filetype_key]
110111
img_bytes = sample[self.data_key]

runtime/ops/filter/img_blurred_images_cleaner/process.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ def __init__(self, *args, **kwargs):
2727

2828
def execute(self, sample: Dict[str, Any]):
2929
start = time.time()
30+
self.read_file_first(sample)
3031
img_bytes = sample[self.data_key]
3132
file_name = sample[self.filename_key]
3233
file_type = "." + sample[self.filetype_key]

runtime/ops/filter/img_duplicated_images_cleaner/process.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ def compute_md5(self, img_bytes: bytes) -> str:
6161
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
6262
"""重复图片去重算子执行入口"""
6363
start = time.time()
64+
self.read_file_first(sample)
6465
file_name = sample[self.filename_key]
6566
self.task_uuid = sample.get("instance_id") if not self.task_uuid else self.task_uuid
6667
img_data = self._duplicate_images_filter(file_name, sample[self.data_key])

runtime/ops/filter/img_similar_images_cleaner/process.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,7 @@ def determine_similar_images(self, file_features: List, p_hash: str, des_matrix:
227227
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
228228
"""去除相似图片算子执行入口"""
229229
start = time.time()
230+
self.read_file_first(sample)
230231
file_name = sample[self.filename_key]
231232
img_bytes = sample[self.data_key]
232233
data = bytes_to_numpy(img_bytes) if img_bytes else np.array([])

runtime/ops/filter/remove_duplicate_file/process.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,7 @@ def determine_similar_text(self, file_features: List, text_minhash: MinHash, fil
150150

151151
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
152152
start = time.time()
153+
self.read_file_first(sample)
153154
file_name = sample[self.filename_key]
154155
self.task_uuid = sample.get("instance_id") if not self.task_uuid else self.task_uuid
155156
sample[self.text_key] = self.deduplicate_files(sample, file_name)

0 commit comments

Comments
 (0)