@@ -68,6 +68,7 @@ VALUES (1, '模态', 'predefined', 0),
6868INSERT IGNORE INTO t_operator
6969(id, name, description, version, inputs, outputs, runtime, settings, file_name, is_star)
7070VALUES (' TextFormatter' , ' TXT文本抽取' , ' 抽取TXT中的文本。' , ' 1.0.0' , ' text' , ' text' , null , null , ' ' , false),
71+ (' UnstructuredFormatter' , ' 非结构化文本抽取' , ' 抽取非结构化文件的文本,目前支持word文档。' , ' 1.0.0' , ' text' , ' text' , null , null , ' ' , false),
7172 (' FileExporter' , ' 落盘算子' , ' 将文件保存到本地目录。' , ' 1.0.0' , ' all' , ' all' , null , null , ' ' , false),
7273 (' FileWithHighRepeatPhraseRateFilter' , ' 文档词重复率检查' , ' 去除重复词过多的文档。' , ' 1.0.0' , ' text' , ' text' , null , ' {"repeatPhraseRatio": {"name": "文档词重复率", "description": "某个词的统计数/文档总词数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}, "hitStopwords": {"name": "去除停用词", "description": "统计重复词时,选择是否要去除停用词。", "type": "switch", "defaultVal": false, "required": true, "checkedLabel": "去除", "unCheckedLabel": "不去除"}}' , ' ' , ' false' ),
7374 (' FileWithHighRepeatWordRateFilter' , ' 文档字重复率检查' , ' 去除重复字过多的文档。' , ' 1.0.0' , ' text' , ' text' , null , ' {"repeatWordRatio": {"name": "文档字重复率", "description": "某个字的统计数/文档总字数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}}' , ' ' , ' false' ),
@@ -121,7 +122,7 @@ AND o.id IN ('TextFormatter', 'FileWithShortOrLongLengthFilter', 'FileWithHighRe
121122 ' AnonymizedIpAddress' , ' AnonymizedPhoneNumber' , ' AnonymizedUrlCleaner' , ' HtmlTagCleaner' , ' XMLTagCleaner' ,
122123 ' ContentCleaner' , ' EmailNumberCleaner' , ' EmojiCleaner' , ' ExtraSpaceCleaner' , ' FullWidthCharacterCleaner' ,
123124 ' GrableCharactersCleaner' , ' InvisibleCharactersCleaner' , ' LegendCleaner' , ' PoliticalWordCleaner' ,
124- ' SexualAndViolentWordCleaner' , ' TraditionalChineseCleaner' , ' UnicodeSpaceCleaner' );
125+ ' SexualAndViolentWordCleaner' , ' TraditionalChineseCleaner' , ' UnicodeSpaceCleaner' , ' UnstructuredFormatter ' );
125126
126127INSERT IGNORE INTO t_operator_category_relation(category_id, operator_id)
127128SELECT c .id , o .id
@@ -137,4 +138,4 @@ SELECT c.id, o.id
137138FROM t_operator_category c
138139 CROSS JOIN t_operator o
139140WHERE c .id IN (7 , 8 , 11 )
140- AND o .id IN (' FileExporter' );
141+ AND o .id IN (' FileExporter' , ' UnstructuredFormatter ' );
0 commit comments