Skip to content

Commit 119e08b

Browse files
committed
优化excel读取内存占用
1 parent 6141ae9 commit 119e08b

File tree

2 files changed

+100
-53
lines changed

2 files changed

+100
-53
lines changed

app-builder/plugins/aipp-plugin/pom.xml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,12 @@
143143
<groupId>org.redisson</groupId>
144144
<artifactId>redisson</artifactId>
145145
</dependency>
146+
<!-- fast excel -->
147+
<dependency>
148+
<groupId>cn.idev.excel</groupId>
149+
<artifactId>fastexcel</artifactId>
150+
<version>1.1.0</version>
151+
</dependency>
146152

147153
<!-- Test -->
148154
<dependency>

app-builder/plugins/aipp-plugin/src/main/java/modelengine/fit/jober/aipp/service/impl/OperatorServiceImpl.java

Lines changed: 94 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,18 @@
66

77
package modelengine.fit.jober.aipp.service.impl;
88

9+
import cn.idev.excel.ExcelReader;
10+
import cn.idev.excel.FastExcel;
11+
import cn.idev.excel.context.AnalysisContext;
12+
import cn.idev.excel.converters.Converter;
13+
import cn.idev.excel.enums.CellDataTypeEnum;
14+
import cn.idev.excel.metadata.GlobalConfiguration;
15+
import cn.idev.excel.metadata.data.DataFormatData;
16+
import cn.idev.excel.metadata.data.ReadCellData;
17+
import cn.idev.excel.metadata.property.ExcelContentProperty;
18+
import cn.idev.excel.read.listener.ReadListener;
19+
import cn.idev.excel.read.metadata.ReadSheet;
20+
import cn.idev.excel.util.DateUtils;
921
import modelengine.fit.jober.aipp.common.exception.AippErrCode;
1022
import modelengine.fit.jober.aipp.common.exception.AippException;
1123
import modelengine.fit.jober.aipp.service.LLMService;
@@ -33,25 +45,14 @@
3345
import org.apache.poi.xwpf.usermodel.XWPFStyles;
3446
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHdrFtr;
3547

36-
import java.io.BufferedInputStream;
37-
import java.io.File;
38-
import java.io.FileOutputStream;
39-
import java.io.IOException;
40-
import java.io.InputStream;
48+
import java.io.*;
49+
import java.math.BigDecimal;
4150
import java.nio.charset.StandardCharsets;
4251
import java.nio.file.Files;
4352
import java.nio.file.Paths;
4453
import java.text.SimpleDateFormat;
45-
import java.util.ArrayList;
46-
import java.util.Arrays;
47-
import java.util.Collections;
48-
import java.util.Date;
49-
import java.util.EnumMap;
50-
import java.util.Iterator;
51-
import java.util.List;
52-
import java.util.Locale;
53-
import java.util.Objects;
54-
import java.util.Optional;
54+
import java.util.*;
55+
import java.util.concurrent.ConcurrentHashMap;
5556
import java.util.function.Function;
5657
import java.util.stream.Collectors;
5758

@@ -126,22 +127,24 @@ public OperatorServiceImpl(LLMService llmService, BrokerClient client) {
126127
this.client = client;
127128
}
128129

129-
private static String getCellValueAsString(Cell cell) {
130-
switch (cell.getCellType()) {
130+
private static String getCellValueAsString(ReadCellData<?> cell) {
131+
switch (cell.getType()) {
131132
case STRING:
132-
return cell.getStringCellValue();
133-
case NUMERIC:
134-
if (DateUtil.isCellDateFormatted(cell)) {
135-
Date dateCellValue = cell.getDateCellValue();
136-
SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
137-
return dateFormat.format(dateCellValue);
133+
return cell.getStringValue();
134+
case NUMBER:
135+
DataFormatData fmt = cell.getDataFormatData();
136+
short formatIndex = fmt.getIndex();
137+
String formatString = fmt.getFormat();
138+
if (DateUtils.isADateFormat(formatIndex,formatString)) {
139+
double value = cell.getNumberValue().doubleValue();
140+
Date date = DateUtils.getJavaDate(value,true);
141+
return new SimpleDateFormat("yyyy-MM-dd").format(date);
138142
} else {
139-
return Double.toString(cell.getNumericCellValue());
143+
BigDecimal num = cell.getNumberValue();
144+
return num.stripTrailingZeros().toPlainString();
140145
}
141146
case BOOLEAN:
142-
return Boolean.toString(cell.getBooleanCellValue());
143-
case FORMULA:
144-
return cell.getCellFormula();
147+
return Boolean.toString(cell.getBooleanValue());
145148
default:
146149
return "";
147150
}
@@ -253,37 +256,53 @@ public String fileExtractor(String fileUrl, Optional<FileType> optionalFileType)
253256
return this.extractTextFile(fileUrl);
254257
}
255258

256-
private String iterExcel(Workbook workbook) {
259+
/**
260+
* 从指定路径的 Excel 文件中提取内容,并返回为字符串形式。
261+
* 实现方式:
262+
* 基于 fast-excel 包,使用流式读取(ReadListener)逐行解析,避免一次性加载整表造成的内存开销。
263+
* 每行数据会被转换为以制表符(\t)分隔的文本,并在行末追加换行符。
264+
* 支持多 sheet 解析,会依次读取工作簿中的每一个 sheet。
265+
*
266+
* @param fileUrl 表示文件路径的 {@link String}.
267+
* @return 表示文件内容的 {@link String}。
268+
* @throws RuntimeException 当文件读取或解析失败时抛出
269+
*/
270+
private String extractExcelFile(String fileUrl) {
271+
File file = Paths.get(fileUrl).toFile();
257272
StringBuilder excelContent = new StringBuilder();
258-
for (int sheetIndex = 0; sheetIndex < workbook.getNumberOfSheets(); sheetIndex++) {
259-
Sheet sheet = workbook.getSheetAt(sheetIndex);
260-
StringBuilder sheetContent = new StringBuilder();
261-
for (Row row : sheet) {
262-
StringBuilder rowContent = new StringBuilder();
263-
Iterator<Cell> cellIterator = row.cellIterator();
264-
while (cellIterator.hasNext()) {
265-
Cell cell = cellIterator.next();
266-
String cellValue = getCellValueAsString(cell);
267-
rowContent.append(cellValue).append("\t");
268-
}
269-
sheetContent.append(rowContent.toString().trim()).append("\n");
273+
ReadListener<Map<Integer, String>> listener = new ReadListener<>() {
274+
@Override
275+
public void invoke(Map<Integer, String> data, AnalysisContext context) {
276+
String line = data.entrySet().stream()
277+
.sorted(Map.Entry.comparingByKey())
278+
.map(e -> e.getValue() == null ? "" : e.getValue())
279+
.collect(Collectors.joining("\t"));
280+
excelContent.append(line).append('\n');
270281
}
271-
excelContent.append("Sheet ").append(sheetIndex + 1).append(":\n").append(sheetContent).append("\n");
272-
}
273-
return excelContent.toString();
274-
}
282+
@Override
283+
public void doAfterAllAnalysed(AnalysisContext context) {
284+
}
285+
};
286+
try (InputStream is = new BufferedInputStream(Files.newInputStream(file.toPath()))) {
287+
ExcelReader reader = FastExcel.read(is, listener)
288+
.registerConverter(new CustomCellStringConverter())
289+
.headRowNumber(0)
290+
.build();
275291

276-
private String extractExcelFile(String fileUrl) {
277-
File file = Paths.get(fileUrl).toFile();
278-
String excelContent = "";
279-
try (InputStream fis = new BufferedInputStream(Files.newInputStream(file.toPath()))) {
280-
Workbook workbook = new XSSFWorkbook(fis);
281-
excelContent = this.iterExcel(workbook);
292+
List<ReadSheet> sheets = reader.excelExecutor().sheetList();
293+
for (ReadSheet meta : sheets) {
294+
excelContent.append("Sheet ").append(meta.getSheetNo() + 1).append(':').append('\n');
295+
ReadSheet readSheet = FastExcel.readSheet(meta.getSheetNo())
296+
.headRowNumber(0)
297+
.build();
298+
reader.read(readSheet);
299+
}
300+
excelContent.append('\n');
301+
reader.finish(); // 关闭资源
282302
} catch (IOException e) {
283-
log.error("read excel fail.", e);
284-
throw new AippException(AippErrCode.EXTRACT_FILE_FAILED);
303+
throw new RuntimeException(e);
285304
}
286-
return excelContent;
305+
return excelContent.toString();
287306
}
288307

289308
private String iterPdf(PDDocument doc) throws IOException {
@@ -339,4 +358,26 @@ private String extractTextFile(String fileUrl) {
339358
throw new AippException(AippErrCode.EXTRACT_FILE_FAILED);
340359
}
341360
}
361+
362+
/**
363+
* 自定义单元格数据转换器。
364+
* 将 Excel 单元格数据统一转换为字符串,避免数值/日期等类型在读取时格式不一致的问题。
365+
* 缺点:由于采用fast excel包,没有 FORMULA类,会将公式单元格自动计算为值
366+
*
367+
*/
368+
public static class CustomCellStringConverter implements Converter<String> {
369+
@Override
370+
public Class<String> supportJavaTypeKey() {
371+
return String.class;
372+
}
373+
@Override
374+
public CellDataTypeEnum supportExcelTypeKey() {
375+
return null;
376+
}
377+
@Override
378+
public String convertToJavaData(ReadCellData<?> cellData, ExcelContentProperty contentProperty,
379+
GlobalConfiguration globalConfiguration) {
380+
return getCellValueAsString(cellData);
381+
}
382+
}
342383
}

0 commit comments

Comments
 (0)