Skip to content

Commit 10eae4c

Browse files
committed
部分重构
1 parent 4b522df commit 10eae4c

File tree

6 files changed

+240
-145
lines changed

6 files changed

+240
-145
lines changed

app-builder/plugins/aipp-plugin/src/main/java/modelengine/fit/jober/aipp/service/impl/OperatorServiceImpl.java

Lines changed: 21 additions & 128 deletions
Original file line numberDiff line numberDiff line change
@@ -6,22 +6,11 @@
66

77
package modelengine.fit.jober.aipp.service.impl;
88

9-
import cn.idev.excel.ExcelReader;
10-
import cn.idev.excel.FastExcel;
11-
import cn.idev.excel.context.AnalysisContext;
12-
import cn.idev.excel.converters.Converter;
13-
import cn.idev.excel.enums.CellDataTypeEnum;
14-
import cn.idev.excel.metadata.GlobalConfiguration;
15-
import cn.idev.excel.metadata.data.DataFormatData;
16-
import cn.idev.excel.metadata.data.ReadCellData;
17-
import cn.idev.excel.metadata.property.ExcelContentProperty;
18-
import cn.idev.excel.read.listener.ReadListener;
19-
import cn.idev.excel.read.metadata.ReadSheet;
20-
import cn.idev.excel.util.DateUtils;
219
import modelengine.fit.jober.aipp.common.exception.AippErrCode;
2210
import modelengine.fit.jober.aipp.common.exception.AippException;
2311
import modelengine.fit.jober.aipp.service.LLMService;
2412
import modelengine.fit.jober.aipp.service.OperatorService;
13+
import modelengine.fit.jober.aipp.tool.FileExtractorContainer;
2514
import modelengine.fit.jober.aipp.util.AippFileUtils;
2615
import modelengine.fit.jober.aipp.util.AippStringUtils;
2716
import modelengine.fitframework.annotation.Component;
@@ -32,12 +21,7 @@
3221
import org.apache.pdfbox.pdmodel.PDDocument;
3322
import org.apache.pdfbox.text.PDFTextStripper;
3423
import org.apache.poi.poifs.filesystem.FileMagic;
35-
import org.apache.poi.ss.usermodel.Cell;
36-
import org.apache.poi.ss.usermodel.DateUtil;
37-
import org.apache.poi.ss.usermodel.Row;
38-
import org.apache.poi.ss.usermodel.Sheet;
39-
import org.apache.poi.ss.usermodel.Workbook;
40-
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
24+
4125
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
4226
import org.apache.poi.xwpf.usermodel.XWPFDocument;
4327
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
@@ -46,13 +30,10 @@
4630
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHdrFtr;
4731

4832
import java.io.*;
49-
import java.math.BigDecimal;
5033
import java.nio.charset.StandardCharsets;
5134
import java.nio.file.Files;
5235
import java.nio.file.Paths;
53-
import java.text.SimpleDateFormat;
5436
import java.util.*;
55-
import java.util.concurrent.ConcurrentHashMap;
5637
import java.util.function.Function;
5738
import java.util.stream.Collectors;
5839

@@ -99,7 +80,6 @@ public class OperatorServiceImpl implements OperatorService {
9980
private final LLMService llmService;
10081
private final BrokerClient client;
10182
private final Function<String, String> pdfExtractor = this::extractPdfFile;
102-
private final Function<String, String> excelExtractor = this::extractExcelFile;
10383
private final Function<String, String> wordExtractor = this::extractWordFile;
10484
private final Function<String, String> textExtractor = this::extractTextFile;
10585
private final EnumMap<FileType, Function<File, String>> outlineOperatorMap =
@@ -109,45 +89,24 @@ public class OperatorServiceImpl implements OperatorService {
10989
}
11090
};
11191

112-
private final EnumMap<FileType, Function<String, String>> fileOperatorMap
113-
= new EnumMap<FileType, Function<String, String>>(FileType.class) {
114-
{
115-
put(FileType.PDF, pdfExtractor);
116-
put(FileType.WORD, wordExtractor);
117-
put(FileType.EXCEL, excelExtractor);
118-
put(FileType.TXT, textExtractor);
119-
put(FileType.HTML, textExtractor);
120-
put(FileType.MARKDOWN, textExtractor);
121-
put(FileType.CSV, textExtractor);
122-
}
123-
};
92+
private final EnumMap<FileType, Function<String, String>> fileOperatorMap =
93+
new EnumMap<FileType, Function<String, String>>(FileType.class) {
94+
{
95+
put(FileType.PDF, pdfExtractor);
96+
put(FileType.WORD, wordExtractor);
97+
put(FileType.TXT, textExtractor);
98+
put(FileType.HTML, textExtractor);
99+
put(FileType.MARKDOWN, textExtractor);
100+
put(FileType.CSV, textExtractor);
101+
}
102+
};
103+
private final FileExtractorContainer fileExtractorContainer;
124104

125-
public OperatorServiceImpl(LLMService llmService, BrokerClient client) {
105+
public OperatorServiceImpl(LLMService llmService, BrokerClient client,
106+
FileExtractorContainer fileExtractorContainer) {
126107
this.llmService = llmService;
127108
this.client = client;
128-
}
129-
130-
private static String getCellValueAsString(ReadCellData<?> cell) {
131-
switch (cell.getType()) {
132-
case STRING:
133-
return cell.getStringValue();
134-
case NUMBER:
135-
DataFormatData fmt = cell.getDataFormatData();
136-
short formatIndex = fmt.getIndex();
137-
String formatString = fmt.getFormat();
138-
if (DateUtils.isADateFormat(formatIndex,formatString)) {
139-
double value = cell.getNumberValue().doubleValue();
140-
Date date = DateUtils.getJavaDate(value,true);
141-
return new SimpleDateFormat("yyyy-MM-dd").format(date);
142-
} else {
143-
BigDecimal num = cell.getNumberValue();
144-
return num.stripTrailingZeros().toPlainString();
145-
}
146-
case BOOLEAN:
147-
return Boolean.toString(cell.getBooleanValue());
148-
default:
149-
return "";
150-
}
109+
this.fileExtractorContainer = fileExtractorContainer;
151110
}
152111

153112
private static String extractDocHandle(InputStream fis, String fileName) throws IOException {
@@ -250,61 +209,16 @@ public File createDoc(String instanceId, String fileName, String txt) throws IOE
250209
*/
251210
public String fileExtractor(String fileUrl, Optional<FileType> optionalFileType) {
252211
if (optionalFileType.isPresent()) {
212+
String res = fileExtractorContainer.extract(fileUrl, optionalFileType.get());
213+
if (!res.isEmpty()) {
214+
return res;
215+
}
253216
Function<String, String> function = this.fileOperatorMap.get(optionalFileType.get());
254217
return Optional.ofNullable(function).map(f -> f.apply(fileUrl)).orElse(StringUtils.EMPTY);
255218
}
256219
return this.extractTextFile(fileUrl);
257220
}
258221

259-
/**
260-
* 从指定路径的 Excel 文件中提取内容,并返回为字符串形式。
261-
* 实现方式:
262-
* 基于 fast-excel 包,使用流式读取(ReadListener)逐行解析,避免一次性加载整表造成的内存开销。
263-
* 每行数据会被转换为以制表符(\t)分隔的文本,并在行末追加换行符。
264-
* 支持多 sheet 解析,会依次读取工作簿中的每一个 sheet。
265-
*
266-
* @param fileUrl 表示文件路径的 {@link String}.
267-
* @return 表示文件内容的 {@link String}。
268-
* @throws RuntimeException 当文件读取或解析失败时抛出
269-
*/
270-
private String extractExcelFile(String fileUrl) {
271-
File file = Paths.get(fileUrl).toFile();
272-
StringBuilder excelContent = new StringBuilder();
273-
ReadListener<Map<Integer, String>> listener = new ReadListener<>() {
274-
@Override
275-
public void invoke(Map<Integer, String> data, AnalysisContext context) {
276-
String line = data.entrySet().stream()
277-
.sorted(Map.Entry.comparingByKey())
278-
.map(e -> e.getValue() == null ? "" : e.getValue())
279-
.collect(Collectors.joining("\t"));
280-
excelContent.append(line).append('\n');
281-
}
282-
@Override
283-
public void doAfterAllAnalysed(AnalysisContext context) {
284-
}
285-
};
286-
try (InputStream is = new BufferedInputStream(Files.newInputStream(file.toPath()))) {
287-
ExcelReader reader = FastExcel.read(is, listener)
288-
.registerConverter(new CustomCellStringConverter())
289-
.headRowNumber(0)
290-
.build();
291-
292-
List<ReadSheet> sheets = reader.excelExecutor().sheetList();
293-
for (ReadSheet meta : sheets) {
294-
excelContent.append("Sheet ").append(meta.getSheetNo() + 1).append(':').append('\n');
295-
ReadSheet readSheet = FastExcel.readSheet(meta.getSheetNo())
296-
.headRowNumber(0)
297-
.build();
298-
reader.read(readSheet);
299-
}
300-
excelContent.append('\n');
301-
reader.finish(); // 关闭资源
302-
} catch (IOException e) {
303-
throw new RuntimeException(e);
304-
}
305-
return excelContent.toString();
306-
}
307-
308222
private String iterPdf(PDDocument doc) throws IOException {
309223
int pages = doc.getNumberOfPages();
310224
StringBuilder sb = new StringBuilder();
@@ -359,25 +273,4 @@ private String extractTextFile(String fileUrl) {
359273
}
360274
}
361275

362-
/**
363-
* 自定义单元格数据转换器。
364-
* 将 Excel 单元格数据统一转换为字符串,避免数值/日期等类型在读取时格式不一致的问题。
365-
* 缺点:由于采用fast excel包,没有 FORMULA类,会将公式单元格自动计算为值
366-
*
367-
*/
368-
public static class CustomCellStringConverter implements Converter<String> {
369-
@Override
370-
public Class<String> supportJavaTypeKey() {
371-
return String.class;
372-
}
373-
@Override
374-
public CellDataTypeEnum supportExcelTypeKey() {
375-
return null;
376-
}
377-
@Override
378-
public String convertToJavaData(ReadCellData<?> cellData, ExcelContentProperty contentProperty,
379-
GlobalConfiguration globalConfiguration) {
380-
return getCellValueAsString(cellData);
381-
}
382-
}
383276
}

app-builder/plugins/aipp-plugin/src/main/java/modelengine/fit/jober/aipp/tool/FileExtractor.java

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,16 +6,18 @@
66

77
package modelengine.fit.jober.aipp.tool;
88

9+
import modelengine.fel.tool.annotation.Group;
10+
import modelengine.fel.tool.annotation.ToolMethod;
11+
import modelengine.fit.jober.aipp.service.OperatorService;
912
import modelengine.fitframework.annotation.Genericable;
1013

11-
import java.io.File;
12-
1314
/**
1415
* 文件内容提取
1516
*
1617
* @author 孙怡菲
1718
* @since 2024-06-08
1819
*/
20+
@Group(name = "defGroup-aipp-file-extract-tool")
1921
public interface FileExtractor {
2022
/**
2123
* 文件提取genericable接口gid
@@ -25,9 +27,12 @@ public interface FileExtractor {
2527
/**
2628
* 提取文件内容
2729
*
28-
* @param file 待提取的文件
30+
* @param fileUrl 待提取的文件地址
2931
* @return 文件内容。
3032
*/
33+
@ToolMethod(name = "file_extract", description = "提取文件信息")
3134
@Genericable(FILE_EXTRACTOR_GID)
32-
String extractFile(File file);
35+
String extractFile(String fileUrl);
36+
37+
OperatorService.FileType supportedType();
3338
}
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
/*---------------------------------------------------------------------------------------------
2+
* Copyright (c) 2025 Huawei Technologies Co., Ltd. All rights reserved.
3+
* This file is a part of the ModelEngine Project.
4+
* Licensed under the MIT License. See License.txt in the project root for license information.
5+
*--------------------------------------------------------------------------------------------*/
6+
7+
package modelengine.fit.jober.aipp.tool;
8+
9+
import modelengine.fit.jober.aipp.service.OperatorService;
10+
import modelengine.fitframework.annotation.Component;
11+
12+
import java.util.EnumMap;
13+
import java.util.List;
14+
import java.util.Map;
15+
16+
@Component
17+
public class FileExtractorContainer {
18+
private Map<OperatorService.FileType, FileExtractor> map;
19+
20+
public FileExtractorContainer(List<FileExtractor> extractors) {
21+
map = new EnumMap<>(OperatorService.FileType.class);
22+
for (FileExtractor fileExtractor : extractors) {
23+
map.put(fileExtractor.supportedType(), fileExtractor);
24+
}
25+
}
26+
27+
public String extract(String fileUrl, OperatorService.FileType fileType) {
28+
if (map.containsKey(fileType)) {
29+
return map.get(fileType).extractFile(fileUrl);
30+
} else {
31+
return "";//先返回空字符串
32+
}
33+
}
34+
35+
}
36+

app-builder/plugins/aipp-plugin/src/main/java/modelengine/fit/jober/aipp/tool/impl/AudioExtractor.java

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
import static modelengine.fit.jober.aipp.constant.AippConstant.NAS_SHARE_DIR;
1010

11+
import modelengine.fel.tool.annotation.Group;
1112
import modelengine.fit.jober.aipp.common.exception.AippErrCode;
1213
import modelengine.fit.jober.aipp.common.exception.AippException;
1314
import modelengine.fit.jober.aipp.dto.audio.AudioSplitInfo;
@@ -16,6 +17,7 @@
1617
import modelengine.fit.jober.aipp.entity.ffmpeg.FfmpegMeta;
1718
import modelengine.fit.jober.aipp.enums.LlmModelNameEnum;
1819
import modelengine.fit.jober.aipp.service.FfmpegService;
20+
import modelengine.fit.jober.aipp.service.OperatorService;
1921
import modelengine.fit.jober.aipp.tool.FileExtractor;
2022
import modelengine.fit.jober.aipp.util.AippFileUtils;
2123
import modelengine.fit.jober.aipp.util.JsonUtils;
@@ -55,7 +57,13 @@
5557
* @since 2024/1/8
5658
*/
5759
@Component
60+
@Group(name = "defGroup-aipp-file-extract-tool-audio")
5861
public class AudioExtractor implements FileExtractor {
62+
@Override
63+
public OperatorService.FileType supportedType() {
64+
return OperatorService.FileType.AUDIO;
65+
}
66+
5967
private static final Logger log = Logger.get(AudioExtractor.class);
6068

6169
private static final String PROMPT = "\nPerform the following actions:\n"
@@ -77,9 +85,8 @@ public class AudioExtractor implements FileExtractor {
7785
private final String pathPrefix;
7886
private final FfmpegService ffmpegService;
7987

80-
public AudioExtractor(FfmpegService ffmpegService, @Fit ChatModel openAiClient,
81-
@Fit VoiceService voiceService, @Value("${app-engine.endpoint}") String endpoint,
82-
@Value("${app-engine.pathPrefix}") String pathPrefix) {
88+
public AudioExtractor(FfmpegService ffmpegService, @Fit ChatModel openAiClient, @Fit VoiceService voiceService,
89+
@Value("${app-engine.endpoint}") String endpoint, @Value("${app-engine.pathPrefix}") String pathPrefix) {
8390
this.ffmpegService = ffmpegService;
8491
this.openAiClient = openAiClient;
8592
this.voiceService = voiceService;
@@ -97,12 +104,14 @@ private SummaryDto batchSummary(List<File> audioList, int segmentSize) throws In
97104
SUMMARY_EXECUTOR.execute(() -> {
98105
try {
99106
File audio = audioList.get(id);
100-
String audioPath = AippFileUtils.getFileDownloadFilePath(
101-
endpoint, this.pathPrefix, audio.getPath());
107+
String audioPath =
108+
AippFileUtils.getFileDownloadFilePath(endpoint, this.pathPrefix, audio.getPath());
102109
log.info("audio filePath: {}, audio fileName: {}", audioPath, audio.getName());
103110
String text = voiceService.getText(audioPath + "&fileName=" + audio.getName());
104-
String summary = LLMUtils.askModelForSummary(openAiClient, String.format(PROMPT, text),
105-
LlmModelNameEnum.QWEN_72B, 16000);
111+
String summary = LLMUtils.askModelForSummary(openAiClient,
112+
String.format(PROMPT, text),
113+
LlmModelNameEnum.QWEN_72B,
114+
16000);
106115
output.set(id, summary);
107116
} catch (IOException e) {
108117
output.set(id, "");
@@ -126,8 +135,10 @@ private SummaryDto generateSummary(List<String> output, int segmentSize) {
126135
StringBuilder sb = new StringBuilder();
127136
summaryDto.getSectionList().forEach(sec -> sb.append(sec.getText()));
128137
try {
129-
String llmOutput = LLMUtils.askModelForSummary(openAiClient, String.format(PROMPT, sb),
130-
LlmModelNameEnum.QWEN_72B, 16000);
138+
String llmOutput = LLMUtils.askModelForSummary(openAiClient,
139+
String.format(PROMPT, sb),
140+
LlmModelNameEnum.QWEN_72B,
141+
16000);
131142
SummarySection section =
132143
JsonUtils.parseObject(LLMUtils.tryFixLlmJsonString(llmOutput), SummarySection.class);
133144
summaryDto.setSummary(section.getText());
@@ -163,8 +174,9 @@ private AudioSplitInfo covertAudioSimple(String dirName, File audio) throws IOEx
163174

164175
@Fitable("llmAudio2Summary")
165176
@Override
166-
public String extractFile(File file) {
177+
public String extractFile(String fileUrl) {
167178
// file -> audioDir 切分为多个音频文件,存在临时目录下
179+
File file = Paths.get(fileUrl).toFile();
168180
String tmpDir = TMP_DIR_PREFIX + UUIDUtil.uuid();
169181
AudioSplitInfo audioSplitInfo;
170182
try {

0 commit comments

Comments
 (0)