- Celery 节点服务列表
+ {{ t("dataPipelines.celery_node_service_list") }}
@@ -278,18 +278,18 @@
/>
@@ -306,7 +306,7 @@
diff --git a/frontend/src/locales/en_js/datapipelines.js b/frontend/src/locales/en_js/datapipelines.js
index 4454df54c..e0c1ddb39 100644
--- a/frontend/src/locales/en_js/datapipelines.js
+++ b/frontend/src/locales/en_js/datapipelines.js
@@ -13,6 +13,7 @@ export const dataPipelines = {
"targetFormat": "Target Format",
"dataFlowBranch": "Data Flow Branch",
"startExecution": "Start Execution",
+ "inProgress": "In Progress",
"searchTaskName": "Search Task Name",
"confirmTermination": "Confirm Termination",
"terminate": "Terminate",
@@ -39,6 +40,7 @@ export const dataPipelines = {
}
},
"testingConnection": "Testing connection",
+ "submitting": "Submitting",
"pleaseSelectAnExecutionTime": "Please select an execution time",
"deletingTask": "Deleting task",
"terminatingTask": "Terminating task",
@@ -183,14 +185,24 @@ export const dataPipelines = {
"deduplicate": "Deduplicate",
"remove": "Remove",
"data_refine": "Data Refinement",
+ "Internal": "Internal",
"data_generation": "Data Generation",
"data_enhancement": "Data Enhancement",
- "Internal": "Internal",
+
+ "data_source": "Data Source",
+ "execution_completed_normally": "Execution completed (normal)",
+ "execution_end_error": "Execution ended (error)",
+ "stopped": "Stopped",
+ "celery_node_service_list": "Celery Node Service List",
+ "ip_address": "IP Address",
+ "current_number_tasks": "Current Number of Tasks",
+ "node_status": "Node Status",
+ "heartbeat_time": "Heartbeat Time",
+
"taskType": "Task Type",
"dataCleaning": "Data Cleaning",
"processingStatus": "Processing Status",
"processingText": "Processing Text",
- "inProgress": "In Progress",
"completed": "Completed",
"dataSource": "Data Source",
"dataSourceBranch": "Data Source Branch",
@@ -244,7 +256,6 @@ export const dataPipelines = {
"uploadFailedTips2": "The icon size cannot exceed 10MB.",
"uploadFailedTips3": "Upload failed, please try again",
"networkError": "Network error, please check the connection and try again",
- "submitting": "Submitting",
"algorithmTemplateDescription": "The algorithm template allows users to build workflows using various model operators, enabling tasks such as data cleaning, automated data augmentation, and analysis.",
"taskTemplate": "Task Template",
"searchTemplate": "Search Template",
@@ -362,6 +373,9 @@ export const dataPipelines = {
"opencsg_data_extraction_preprocess_internal": "opencsg data extraction preprocess",
"opencsg_scrape_url_data_preprocess_internal": "opencsg scrape url data preprocess",
+ "fineweb_edu_chinese_common_internal": "fineweb edu chinese common",
+ "smoltalk_chinese_common_internal": "smoltalk chinese common",
+ "cosmopedia_chinese_preprocess_internal": "cosmopedia chinese preprocess",
"analysis_common_internal_dec": "This analyzer class is used to analyze specific datasets. It calculates statistics for all filtering operations in the configuration file, applies various analyses (such as overall analysis, column-by-column analysis, etc.) to these statistics, and generates analysis results (statistical tables, distribution charts, etc.) to help users better understand the input dataset.",
@@ -379,4 +393,7 @@ export const dataPipelines = {
"quality_classifier_common_internal_dec": "This quality classifier class is used to predict the scores of documents in the dataset. It will calculate scores for all rows and provide two columns for each row: score and should_keep, to help users decide which row should be deleted. By default, if the score is higher than 0.9, the row will be marked as should_keep=1.",
"opencsg_data_extraction_preprocess_internal_dec": "A high-quality tool for converting PDF to Markdown and JSON",
"opencsg_scrape_url_data_preprocess_internal_dec": "A large language model-based data scraping tool for websites and local documents (XML, HTML, JSON, etc.)",
+ "fineweb_edu_chinese_common_internal_dec": "Users can define their own scoring criteria, score the data from the data source based on these criteria, and filter the data. The maximum score is 5.",
+ "smoltalk_chinese_common_internal_dec": "Use a fixed system_prompt to generate relevant multi-round dialogues with a large model and score them. Filter the data based on the score specified by the user, and only retain the one with the highest score.",
+ "cosmopedia_chinese_preprocess_internal_dec": "A detailed tutorial on converting raw text to WikiHow style using the MakeCosmopediaMapper operator. This tool invokes large language models to generate structured tutorial content based on the input seed text.",
}
\ No newline at end of file
diff --git a/frontend/src/locales/en_js/operator_en.json b/frontend/src/locales/en_js/operator_en.json
index a38877dd9..59621d480 100644
--- a/frontend/src/locales/en_js/operator_en.json
+++ b/frontend/src/locales/en_js/operator_en.json
@@ -1670,5 +1670,106 @@
"after": "The dataset adds embedding, nn_indices, and nn_scores fields containing vector representations of text and nearest neighbor information"
},
"params": []
+ },
+ "gather_generated_data_filter": {
+ "name": "gather_generated_data_filter",
+ "description": "Filter for collecting and processing generated data.",
+ "type": "Filter",
+ "group": "",
+ "samples": {
+ "before": "Based on the results of the previous step, remove the || and <|im_end|> characters and filter to get the data whose content is empty.",
+ "after": ""
+ },
+ "params": []
+ },
+ "annotate_edu_train_bert_scorer_mapper": {
+ "name": "annotate_edu_train_bert_scorer_mapper",
+ "description": "Annotate Edu Train BERT Scorer",
+ "type": "Filter",
+ "group": "",
+ "samples": {
+ "before": "Score a field and add a _score field for the result.",
+ "after": ""
+ },
+ "params": [
+ {
+ "name": "auth_token",
+ "type": "LIST",
+ "option_values": null,
+ "value": ""
+ },
+ {
+ "name": "model_name",
+ "type": "LIST",
+ "option_values": null,
+ "value": "text-embedding-v4"
+ },
+ {
+ "name": "dimensions",
+ "type": "PositiveFloat",
+ "option_values": null,
+ "value": "1024"
+ },
+ {
+ "name": "model_url",
+ "type": "LIST",
+ "option_values": null,
+ "value": "https://dashscope.aliyuncs.com/compatible-mode/v1"
+ },
+ {
+ "name": "query_text",
+ "type": "LIST",
+ "option_values": null,
+ "value": "What is Deep Learning?"
+ }
+ ]
+ },
+ "dedup_and_save_deduplicator": {
+ "name": "dedup_and_save_deduplicator",
+ "description": "A deduplicator based on graph connectivity. It constructs a similarity graph by connecting samples with similarity scores above the threshold, then keeps only one sample (with minimum index) from each connected component. Suitable for datasets with pre-computed nearest neighbor similarity information.",
+ "type": "Deduplicator",
+ "group": "",
+ "samples": {
+ "before": "",
+ "after": ""
+ },
+ "params": [
+ {
+ "name": "similarity_threshold",
+ "type": "PositiveFloat",
+ "option_values": null,
+ "value": 0.5
+ }
+ ]
+ },
+ "pipeline_magpie_zh_mapper": {
+ "name": "pipeline_magpie_zh_mapper",
+ "description": "Using the deepseek-v2.5 or qwen2.5 model, generate multi-round dialogue data based on the manually designed system_prompt corresponding to multiple tasks",
+ "type": "Mapper",
+ "group": "",
+ "samples": {
+ "before": "",
+ "after": ""
+ },
+ "params": [
+ {
+ "name": "model_name",
+ "type": "LIST",
+ "option_values": null,
+ "value": "qwen-plus"
+ },
+ {
+ "name": "auth_token",
+ "type": "LIST",
+ "option_values": null,
+ "value": ""
+ },
+ {
+ "name": "model_url",
+ "type": "LIST",
+ "option_values": null,
+ "value": "https://dashscope.aliyuncs.com/compatible-mode/v1"
+ }
+ ]
}
}
\ No newline at end of file
diff --git a/frontend/src/locales/zh_hant_js/datapipelines.js b/frontend/src/locales/zh_hant_js/datapipelines.js
index fa73d668c..e81588c03 100644
--- a/frontend/src/locales/zh_hant_js/datapipelines.js
+++ b/frontend/src/locales/zh_hant_js/datapipelines.js
@@ -18,7 +18,6 @@ export const dataPipelines = {
"terminate": "終止",
"waiting": "等待中",
"error": "錯誤",
- "submitting": "提交中",
"taskStatus": "任務狀態",
"labelStudio": "資料標註",
"dataSourceInfo": {
@@ -43,6 +42,7 @@ export const dataPipelines = {
"pleaseSelectAnExecutionTime": "請選擇執行時間",
"deletingTask": "正在刪除任務",
"terminatingTask": "正在終止任務",
+ "createTask": "創建任務",
"addDataSource": "添加數據源",
"fileFormat": "文件格式",
"connectionStatus": "連接狀態",
@@ -96,7 +96,6 @@ export const dataPipelines = {
"loading": "加載中",
"taskCategories": "任務分類",
"allCategories": "全部分類",
- "createTask": "創建任務",
"taskList": "任務列表",
"taskName": "任務名稱",
"DatabaseName": "數據庫名稱",
@@ -184,9 +183,20 @@ export const dataPipelines = {
"deduplicate": "去重",
"remove": "刪除",
"data_refine": "數據處理",
+ "Internal": "內部",
"data_generation": "數據生成",
"data_enhancement": "數據增強",
- "Internal": "內部",
+
+ "data_source": "數據源",
+ "execution_completed_normally": "執行結束(正常)",
+ "execution_end_error": "執行結束(錯誤)",
+ "stopped": "已停止",
+ "celery_node_service_list": "Celery 節點服務列表",
+ "ip_address": "IP 地址",
+ "current_number_tasks": "當前任務數",
+ "node_status": "節點狀態",
+ "heartbeat_time": "心跳時間",
+
"taskType": "任務類型",
"dataCleaning": "數據清洗",
"processingStatus": "處理狀態",
@@ -245,6 +255,7 @@ export const dataPipelines = {
"uploadFailedTips2": "圖標大小不能超過10MB",
"uploadFailedTips3": "上傳失敗,請重試",
"networkError": "網絡錯誤,請檢查連接後重試",
+ "submitting": "保存中",
"algorithmTemplateDescription": "算法模版可支持用戶使用多種不同的模型算子組成工作流,完成數據清洗、自動數據增強及分析等工作。",
"taskTemplate": "任務模板",
"templateName": "模板名稱",
@@ -361,6 +372,9 @@ export const dataPipelines = {
"quality_classifier_common_internal": "質量分類器通用",
"opencsg_data_extraction_preprocess_internal": "開放計算系統數據提取預處理",
"opencsg_scrape_url_data_preprocess_internal": "開放計算系統抓取 URL 數據預處理",
+ "fineweb_edu_chinese_common_internal": "文本價值評估",
+ "smoltalk_chinese_common_internal": "高質量對話生成",
+ "cosmopedia_chinese_preprocess_internal": "增強文本描述工具",
"analysis_common_internal_dec": "此分析器類用於分析特定數據集。它會為配置文件中的所有過濾操作計算統計數據,對這些統計數據應用多種分析(如整體分析、逐列分析等),並生成分析結果(統計表、分佈圖等),幫助用戶更好地理解輸入數據集。",
"dataset_spliter_by_language_preprocess_internal_dec": "從源目錄加載數據集,然後使用名為 LanguageIDScoreFilter 的操作過濾器進行語言識別,最後按語言分割數據集並保存。",
@@ -377,4 +391,7 @@ export const dataPipelines = {
"quality_classifier_common_internal_dec": "本質量分類器類用於預測數據集中文檔的評分。它將計算所有行的分數,並為每一行提供兩列:分數(score)和是否保留(should_keep),以幫助用戶決定應該刪除哪一行。默認情況下,如果分數高於 0.9,則將該行標記為 should_keep=1。",
"opencsg_data_extraction_preprocess_internal_dec": "一個高質量的工具,用於將 PDF 轉換為 Markdown 和 JSON",
"opencsg_scrape_url_data_preprocess_internal_dec": "基於大型語言模型的網站和本地文檔(XML、HTML、JSON 等)的數據抓取工具",
+ "fineweb_edu_chinese_common_internal_dec": "用戶可以定義自己的評分標準,根據這些標準對數據源中的數據進行評分,並過濾數據。最高分是5分。",
+ "smoltalk_chinese_common_internal_dec": "使用固定的system_prompt,通過大型模型生成相關的多輪對話並對其進行評分。根據用戶指定的分數過濾數據,只保留分數最高的數據。",
+ "cosmopedia_chinese_preprocess_internal_dec": "使用MakeCosmopediaMapper操作符將原始文本轉換為WikiHow樣式的詳細教程。此工具調用大型語言模型,根據輸入的種子文本生成結構化教程內容。",
}
\ No newline at end of file
diff --git a/frontend/src/locales/zh_hant_js/operator_zh.json b/frontend/src/locales/zh_hant_js/operator_zh.json
index d4a6a00f8..2845ac58e 100644
--- a/frontend/src/locales/zh_hant_js/operator_zh.json
+++ b/frontend/src/locales/zh_hant_js/operator_zh.json
@@ -1669,5 +1669,106 @@
"after": "数据集增加了embedding、nn_indices和nn_scores字段,包含文本的向量表示和最近邻信息"
},
"params": []
+ },
+ "gather_generated_data_filter": {
+ "name": "数据聚合生成",
+ "description": "用于收集和处理生成数据的过滤器。",
+ "type": "Filter",
+ "group": "",
+ "samples": {
+ "before": "基于前一步结果,除掉 || 与 <|im_end|> 字符并且过滤出 content 为空的数据",
+ "after": ""
+ },
+ "params": []
+ },
+ "annotate_edu_train_bert_scorer_mapper": {
+ "name": "教学评估打分",
+ "description": "标注Edu训练BERT评分器。",
+ "type": "Filter",
+ "group": "",
+ "samples": {
+ "before": "对某个字段进行打分,并为结果添加 _score 字段。",
+ "after": ""
+ },
+ "params": [
+ {
+ "name": "身份验证令牌",
+ "type": "LIST",
+ "option_values": null,
+ "value": ""
+ },
+ {
+ "name": "模型名称",
+ "type": "LIST",
+ "option_values": null,
+ "value": "text-embedding-v4"
+ },
+ {
+ "name": "尺寸",
+ "type": "PositiveFloat",
+ "option_values": null,
+ "value": "1024"
+ },
+ {
+ "name": "模型网址",
+ "type": "LIST",
+ "option_values": null,
+ "value": "https://dashscope.aliyuncs.com/compatible-mode/v1"
+ },
+ {
+ "name": "查询文本",
+ "type": "LIST",
+ "option_values": null,
+ "value": "What is Deep Learning?"
+ }
+ ]
+ },
+ "dedup_and_save_deduplicator": {
+ "name": "相似去重",
+ "description": "基于图连接性的去重器。它通过连接相似度得分高于阈值的样本来构建相似度图,然后从每个连接的组件中只保留一个样本(具有最小索引)。适用于具有预先计算的最近邻相似性信息的数据集。",
+ "type": "Deduplicator",
+ "group": "",
+ "samples": {
+ "before": "",
+ "after": ""
+ },
+ "params": [
+ {
+ "name": "相似阈值",
+ "type": "PositiveFloat",
+ "option_values": null,
+ "value": 0.5
+ }
+ ]
+ },
+ "pipeline_magpie_zh_mapper": {
+ "name": "多轮对话生成",
+ "description": "使用deepseek-v2.5或qwen2.5模型,根据手动设计的与多个任务对应的system_prompt生成多轮对话数据",
+ "type": "Mapper",
+ "group": "",
+ "samples": {
+ "before": "",
+ "after": ""
+ },
+ "params": [
+ {
+ "name": "模型名称",
+ "type": "LIST",
+ "option_values": null,
+ "value": "qwen-plus"
+ },
+ {
+ "name": "身份验证令牌",
+ "type": "LIST",
+ "option_values": null,
+ "value": ""
+ },
+ {
+ "name": "模型网址",
+ "type": "LIST",
+ "option_values": null,
+ "value": "https://dashscope.aliyuncs.com/compatible-mode/v1"
+ }
+ ]
}
}
\ No newline at end of file
diff --git a/frontend/src/locales/zh_hant_js/operator_zhHant.json b/frontend/src/locales/zh_hant_js/operator_zhHant.json
index 79081dde3..ad8356a9d 100644
--- a/frontend/src/locales/zh_hant_js/operator_zhHant.json
+++ b/frontend/src/locales/zh_hant_js/operator_zhHant.json
@@ -1669,5 +1669,106 @@
"after": "數據集增加了embedding、nn_indices和nn_scores字段,包含文本的向量表示和最近鄰信息"
},
"params": []
+ },
+ "gather_generated_data_filter": {
+ "name": "數據聚合生成",
+ "description": "用於收集和處理生成數據的過濾器。",
+ "type": "Filter",
+ "group": "",
+ "samples": {
+ "before": "基於前一步結果,除掉 || 與 <|im_end|> 字符並且過濾出 content 為空的數據",
+ "after": ""
+ },
+ "params": []
+ },
+ "annotate_edu_train_bert_scorer_mapper": {
+ "name": "教學評估打分",
+ "description": "標註Edu訓練BERT評分器。",
+ "type": "Filter",
+ "group": "",
+ "samples": {
+ "before": "對某個字段進行打分,並為結果添加 _score 字段。",
+ "after": ""
+ },
+ "params": [
+ {
+ "name": "身份驗證令牌",
+ "type": "LIST",
+ "option_values": null,
+ "value": ""
+ },
+ {
+ "name": "模型名稱",
+ "type": "LIST",
+ "option_values": null,
+ "value": "text-embedding-v4"
+ },
+ {
+ "name": "尺寸",
+ "type": "PositiveFloat",
+ "option_values": null,
+ "value": "1024"
+ },
+ {
+ "name": "模型網址",
+ "type": "LIST",
+ "option_values": null,
+ "value": "https://dashscope.aliyuncs.com/compatible-mode/v1"
+ },
+ {
+ "name": "查詢文本",
+ "type": "LIST",
+ "option_values": null,
+ "value": "What is Deep Learning?"
+ }
+ ]
+ },
+ "dedup_and_save_deduplicator": {
+ "name": "相似去重",
+ "description": "基於圖連接性的去重器。它通過連接相似度得分高於閾值的樣本來構建相似度圖,然後從每個連接的組件中只保留一個樣本(具有最小索引)。適用於具有預先計算的最近鄰相似性信息的數據集。",
+ "type": "Deduplicator",
+ "group": "",
+ "samples": {
+ "before": "",
+ "after": ""
+ },
+ "params": [
+ {
+ "name": "相似閾值",
+ "type": "PositiveFloat",
+ "option_values": null,
+ "value": 0.5
+ }
+ ]
+ },
+ "pipeline_magpie_zh_mapper": {
+ "name": "多輪對話生成",
+ "description": "使用deepseek-v2.5或qwen2.5模型,根據手動設計的與多個任務對應的system_prompt生成多輪對話數據",
+ "type": "Mapper",
+ "group": "",
+ "samples": {
+ "before": "",
+ "after": ""
+ },
+ "params": [
+ {
+ "name": "模型名稱",
+ "type": "LIST",
+ "option_values": null,
+ "value": "qwen-plus"
+ },
+ {
+ "name": "身份驗證令牌",
+ "type": "LIST",
+ "option_values": null,
+ "value": ""
+ },
+ {
+ "name": "模型網址",
+ "type": "LIST",
+ "option_values": null,
+ "value": "https://dashscope.aliyuncs.com/compatible-mode/v1"
+ }
+ ]
}
}
\ No newline at end of file
diff --git a/frontend/src/locales/zh_js/datapipelines.js b/frontend/src/locales/zh_js/datapipelines.js
index 72b828a31..845590a38 100644
--- a/frontend/src/locales/zh_js/datapipelines.js
+++ b/frontend/src/locales/zh_js/datapipelines.js
@@ -183,9 +183,20 @@ export const dataPipelines = {
"deduplicate": "去重",
"remove": "删除",
"data_refine": "数据处理",
+ "Internal": "内部",
"data_generation": "数据生成",
"data_enhancement": "数据增强",
- "Internal": "内部",
+
+ "data_source": "数据源",
+ "execution_completed_normally": "执行结束(正常)",
+ "execution_end_error": "执行结束(错误)",
+ "stopped": "已停止",
+ "celery_node_service_list": "Celery 节点服务列表",
+ "ip_address": "IP 地址",
+ "current_number_tasks": "当前任务数",
+ "node_status": "节点状态",
+ "heartbeat_time": "心跳时间",
+
"taskType": "任务类型",
"dataCleaning": "数据清洗",
"processingStatus": "处理状态",
@@ -361,6 +372,9 @@ export const dataPipelines = {
"quality_classifier_common_internal": "质量分类器通用",
"opencsg_data_extraction_preprocess_internal": "开放计算系统数据提取预处理",
"opencsg_scrape_url_data_preprocess_internal": "开放计算系统抓取 URL 数据预处理",
+ "fineweb_edu_chinese_common_internal": "文本价值评估",
+ "smoltalk_chinese_common_internal": "高质量对话生成",
+ "cosmopedia_chinese_preprocess_internal": "增强文本描述工具",
"analysis_common_internal_dec": "此分析器类用于分析特定数据集。它会为配置文件中的所有过滤操作计算统计数据,对这些统计数据应用多种分析(如整体分析、逐列分析等),并生成分析结果(统计表、分布图等),帮助用户更好地理解输入数据集。",
"dataset_spliter_by_language_preprocess_internal_dec": "从源目录加载数据集,然后使用名为 LanguageIDScoreFilter 的操作过滤器进行语言识别,最后按语言分割数据集并保存。",
@@ -377,4 +391,7 @@ export const dataPipelines = {
"quality_classifier_common_internal_dec": "本质量分类器类用于预测数据集中文档的评分。它将计算所有行的分数,并为每一行提供两列:分数(score)和是否保留(should_keep),以帮助用户决定应该删除哪一行。默认情况下,如果分数高于 0.9,则将该行标记为 should_keep=1。",
"opencsg_data_extraction_preprocess_internal_dec": "一个高质量的工具,用于将 PDF 转换为 Markdown 和 JSON",
"opencsg_scrape_url_data_preprocess_internal_dec": "基于大型语言模型的网站和本地文档(XML、HTML、JSON 等)的数据抓取工具",
+ "fineweb_edu_chinese_common_internal_dec": "用户可以定义自己的评分标准,根据这些标准对数据源中的数据进行评分,并过滤数据。最高分是5分。",
+ "smoltalk_chinese_common_internal_dec": "使用固定的system_prompt,通过大型模型生成相关的多轮对话并对其进行评分。根据用户指定的分数过滤数据,只保留分数最高的数据。",
+ "cosmopedia_chinese_preprocess_internal_dec": "使用MakeCosmopediaMapper操作符将原始文本转换为WikiHow样式的详细教程。此工具调用大型语言模型,根据输入的种子文本生成结构化教程内容。",
}
diff --git a/frontend/src/locales/zh_js/operator_zh.json b/frontend/src/locales/zh_js/operator_zh.json
index d4a6a00f8..2845ac58e 100644
--- a/frontend/src/locales/zh_js/operator_zh.json
+++ b/frontend/src/locales/zh_js/operator_zh.json
@@ -1669,5 +1669,106 @@
"after": "数据集增加了embedding、nn_indices和nn_scores字段,包含文本的向量表示和最近邻信息"
},
"params": []
+ },
+ "gather_generated_data_filter": {
+ "name": "数据聚合生成",
+ "description": "用于收集和处理生成数据的过滤器。",
+ "type": "Filter",
+ "group": "",
+ "samples": {
+ "before": "基于前一步结果,除掉 || 与 <|im_end|> 字符并且过滤出 content 为空的数据",
+ "after": ""
+ },
+ "params": []
+ },
+ "annotate_edu_train_bert_scorer_mapper": {
+ "name": "教学评估打分",
+ "description": "标注Edu训练BERT评分器。",
+ "type": "Filter",
+ "group": "",
+ "samples": {
+ "before": "对某个字段进行打分,并为结果添加 _score 字段。",
+ "after": ""
+ },
+ "params": [
+ {
+ "name": "身份验证令牌",
+ "type": "LIST",
+ "option_values": null,
+ "value": ""
+ },
+ {
+ "name": "模型名称",
+ "type": "LIST",
+ "option_values": null,
+ "value": "text-embedding-v4"
+ },
+ {
+ "name": "尺寸",
+ "type": "PositiveFloat",
+ "option_values": null,
+ "value": "1024"
+ },
+ {
+ "name": "模型网址",
+ "type": "LIST",
+ "option_values": null,
+ "value": "https://dashscope.aliyuncs.com/compatible-mode/v1"
+ },
+ {
+ "name": "查询文本",
+ "type": "LIST",
+ "option_values": null,
+ "value": "What is Deep Learning?"
+ }
+ ]
+ },
+ "dedup_and_save_deduplicator": {
+ "name": "相似去重",
+ "description": "基于图连接性的去重器。它通过连接相似度得分高于阈值的样本来构建相似度图,然后从每个连接的组件中只保留一个样本(具有最小索引)。适用于具有预先计算的最近邻相似性信息的数据集。",
+ "type": "Deduplicator",
+ "group": "",
+ "samples": {
+ "before": "",
+ "after": ""
+ },
+ "params": [
+ {
+ "name": "相似阈值",
+ "type": "PositiveFloat",
+ "option_values": null,
+ "value": 0.5
+ }
+ ]
+ },
+ "pipeline_magpie_zh_mapper": {
+ "name": "多轮对话生成",
+ "description": "使用deepseek-v2.5或qwen2.5模型,根据手动设计的与多个任务对应的system_prompt生成多轮对话数据",
+ "type": "Mapper",
+ "group": "",
+ "samples": {
+ "before": "",
+ "after": ""
+ },
+ "params": [
+ {
+ "name": "模型名称",
+ "type": "LIST",
+ "option_values": null,
+ "value": "qwen-plus"
+ },
+ {
+ "name": "身份验证令牌",
+ "type": "LIST",
+ "option_values": null,
+ "value": ""
+ },
+ {
+ "name": "模型网址",
+ "type": "LIST",
+ "option_values": null,
+ "value": "https://dashscope.aliyuncs.com/compatible-mode/v1"
+ }
+ ]
}
}
\ No newline at end of file