|
6 | 6 |
|
7 | 7 | package modelengine.fit.jober.aipp.service.impl; |
8 | 8 |
|
| 9 | +import cn.idev.excel.ExcelReader; |
| 10 | +import cn.idev.excel.FastExcel; |
| 11 | +import cn.idev.excel.context.AnalysisContext; |
| 12 | +import cn.idev.excel.converters.Converter; |
| 13 | +import cn.idev.excel.enums.CellDataTypeEnum; |
| 14 | +import cn.idev.excel.metadata.GlobalConfiguration; |
| 15 | +import cn.idev.excel.metadata.data.DataFormatData; |
| 16 | +import cn.idev.excel.metadata.data.ReadCellData; |
| 17 | +import cn.idev.excel.metadata.property.ExcelContentProperty; |
| 18 | +import cn.idev.excel.read.listener.ReadListener; |
| 19 | +import cn.idev.excel.read.metadata.ReadSheet; |
| 20 | +import cn.idev.excel.util.DateUtils; |
9 | 21 | import modelengine.fit.jober.aipp.common.exception.AippErrCode; |
10 | 22 | import modelengine.fit.jober.aipp.common.exception.AippException; |
11 | 23 | import modelengine.fit.jober.aipp.service.LLMService; |
|
33 | 45 | import org.apache.poi.xwpf.usermodel.XWPFStyles; |
34 | 46 | import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHdrFtr; |
35 | 47 |
|
36 | | -import java.io.BufferedInputStream; |
37 | | -import java.io.File; |
38 | | -import java.io.FileOutputStream; |
39 | | -import java.io.IOException; |
40 | | -import java.io.InputStream; |
| 48 | +import java.io.*; |
| 49 | +import java.math.BigDecimal; |
41 | 50 | import java.nio.charset.StandardCharsets; |
42 | 51 | import java.nio.file.Files; |
43 | 52 | import java.nio.file.Paths; |
44 | 53 | import java.text.SimpleDateFormat; |
45 | | -import java.util.ArrayList; |
46 | | -import java.util.Arrays; |
47 | | -import java.util.Collections; |
48 | | -import java.util.Date; |
49 | | -import java.util.EnumMap; |
50 | | -import java.util.Iterator; |
51 | | -import java.util.List; |
52 | | -import java.util.Locale; |
53 | | -import java.util.Objects; |
54 | | -import java.util.Optional; |
| 54 | +import java.util.*; |
| 55 | +import java.util.concurrent.ConcurrentHashMap; |
55 | 56 | import java.util.function.Function; |
56 | 57 | import java.util.stream.Collectors; |
57 | 58 |
|
@@ -126,22 +127,24 @@ public OperatorServiceImpl(LLMService llmService, BrokerClient client) { |
126 | 127 | this.client = client; |
127 | 128 | } |
128 | 129 |
|
129 | | - private static String getCellValueAsString(Cell cell) { |
130 | | - switch (cell.getCellType()) { |
| 130 | + private static String getCellValueAsString(ReadCellData<?> cell) { /* Renders the given cell as text according to its cell type. */ |
| 131 | + switch (cell.getType()) { |
131 | 132 | case STRING: |
132 | | - return cell.getStringCellValue(); |
133 | | - case NUMERIC: |
134 | | - if (DateUtil.isCellDateFormatted(cell)) { |
135 | | - Date dateCellValue = cell.getDateCellValue(); |
136 | | - SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd"); |
137 | | - return dateFormat.format(dateCellValue); |
| 133 | + return cell.getStringValue(); |
| 134 | + case NUMBER: |
| 135 | + DataFormatData fmt = cell.getDataFormatData(); /* NOTE(review): fmt is dereferenced below without a null check - confirm it is always set for NUMBER cells */ |
| 136 | + short formatIndex = fmt.getIndex(); |
| 137 | + String formatString = fmt.getFormat(); |
| 138 | + if (DateUtils.isADateFormat(formatIndex,formatString)) { /* the number carries a date format: render it as yyyy-MM-dd */ |
| 139 | + double value = cell.getNumberValue().doubleValue(); |
| 140 | + Date date = DateUtils.getJavaDate(value,true); |
| 141 | + return new SimpleDateFormat("yyyy-MM-dd").format(date); |
138 | 142 | } else { |
139 | | - return Double.toString(cell.getNumericCellValue()); |
| 143 | + BigDecimal num = cell.getNumberValue(); |
| 144 | + return num.stripTrailingZeros().toPlainString(); /* plain decimal notation: trailing zeros removed, no exponent */ |
140 | 145 | } |
141 | 146 | case BOOLEAN: |
142 | | - return Boolean.toString(cell.getBooleanCellValue()); |
143 | | - case FORMULA: |
144 | | - return cell.getCellFormula(); |
| 147 | + return Boolean.toString(cell.getBooleanValue()); |
145 | 148 | default: |
146 | 149 | return ""; /* every other cell type yields an empty string */ |
147 | 150 | } |
@@ -253,37 +256,53 @@ public String fileExtractor(String fileUrl, Optional<FileType> optionalFileType) |
253 | 256 | return this.extractTextFile(fileUrl); |
254 | 257 | } |
255 | 258 |
|
256 | | - private String iterExcel(Workbook workbook) { |
| 259 | + /** |
| 260 | + * Extracts the content of the Excel file at the given path and returns it as a string. |
| 261 | + * Implementation notes: |
| 262 | + * Built on the fast-excel package; rows are delivered one at a time to a ReadListener, which is meant to avoid loading a whole sheet into memory at once. |
| 263 | + * Each row becomes tab-separated (\t) text followed by a newline character. |
| 264 | + * All sheets of the workbook are read in order, each one introduced by a "Sheet N:" header line. |
| 265 | + * |
| 266 | + * @param fileUrl the path of the file, as a {@link String}. |
| 267 | + * @return the content of the file, as a {@link String}. |
| 268 | + * @throws RuntimeException if an {@link IOException} occurs while reading the file; the IOException is kept as the cause. |
| 269 | + */ |
| 270 | + private String extractExcelFile(String fileUrl) { |
| 271 | + File file = Paths.get(fileUrl).toFile(); |
257 | 272 | StringBuilder excelContent = new StringBuilder(); |
258 | | - for (int sheetIndex = 0; sheetIndex < workbook.getNumberOfSheets(); sheetIndex++) { |
259 | | - Sheet sheet = workbook.getSheetAt(sheetIndex); |
260 | | - StringBuilder sheetContent = new StringBuilder(); |
261 | | - for (Row row : sheet) { |
262 | | - StringBuilder rowContent = new StringBuilder(); |
263 | | - Iterator<Cell> cellIterator = row.cellIterator(); |
264 | | - while (cellIterator.hasNext()) { |
265 | | - Cell cell = cellIterator.next(); |
266 | | - String cellValue = getCellValueAsString(cell); |
267 | | - rowContent.append(cellValue).append("\t"); |
268 | | - } |
269 | | - sheetContent.append(rowContent.toString().trim()).append("\n"); |
| 273 | + ReadListener<Map<Integer, String>> listener = new ReadListener<>() { |
| 274 | + @Override |
| 275 | + public void invoke(Map<Integer, String> data, AnalysisContext context) { |
| 276 | + String line = data.entrySet().stream() |
| 277 | + .sorted(Map.Entry.comparingByKey()) /* order by map key (presumably the column index - confirm) */ |
| 278 | + .map(e -> e.getValue() == null ? "" : e.getValue()) /* null cell values become empty strings */ |
| 279 | + .collect(Collectors.joining("\t")); |
| 280 | + excelContent.append(line).append('\n'); |
270 | 281 | } |
271 | | - excelContent.append("Sheet ").append(sheetIndex + 1).append(":\n").append(sheetContent).append("\n"); |
272 | | - } |
273 | | - return excelContent.toString(); |
274 | | - } |
| 282 | + @Override |
| 283 | + public void doAfterAllAnalysed(AnalysisContext context) { |
| 284 | + } |
| 285 | + }; |
| 286 | + try (InputStream is = new BufferedInputStream(Files.newInputStream(file.toPath()))) { |
| 287 | + ExcelReader reader = FastExcel.read(is, listener) |
| 288 | + .registerConverter(new CustomCellStringConverter()) |
| 289 | + .headRowNumber(0) |
| 290 | + .build(); |
275 | 291 |
|
276 | | - private String extractExcelFile(String fileUrl) { |
277 | | - File file = Paths.get(fileUrl).toFile(); |
278 | | - String excelContent = ""; |
279 | | - try (InputStream fis = new BufferedInputStream(Files.newInputStream(file.toPath()))) { |
280 | | - Workbook workbook = new XSSFWorkbook(fis); |
281 | | - excelContent = this.iterExcel(workbook); |
| 292 | + List<ReadSheet> sheets = reader.excelExecutor().sheetList(); |
| 293 | + for (ReadSheet meta : sheets) { |
| 294 | + excelContent.append("Sheet ").append(meta.getSheetNo() + 1).append(':').append('\n'); |
| 295 | + ReadSheet readSheet = FastExcel.readSheet(meta.getSheetNo()) |
| 296 | + .headRowNumber(0) |
| 297 | + .build(); |
| 298 | + reader.read(readSheet); |
| 299 | + } |
| 300 | + excelContent.append('\n'); /* one blank line is appended after all sheets, not after each sheet */ |
| 301 | + reader.finish(); // release the reader's resources; NOTE(review): not reached if read() throws - consider try/finally |
282 | 302 | } catch (IOException e) { |
283 | | - log.error("read excel fail.", e); |
284 | | - throw new AippException(AippErrCode.EXTRACT_FILE_FAILED); |
| 303 | + throw new RuntimeException(e); /* NOTE(review): the removed code logged and threw AippException(EXTRACT_FILE_FAILED) - confirm the change of exception type is intended */ |
285 | 304 | } |
286 | | - return excelContent; |
| 305 | + return excelContent.toString(); |
287 | 306 | } |
288 | 307 |
|
289 | 308 | private String iterPdf(PDDocument doc) throws IOException { |
@@ -339,4 +358,26 @@ private String extractTextFile(String fileUrl) { |
339 | 358 | throw new AippException(AippErrCode.EXTRACT_FILE_FAILED); |
340 | 359 | } |
341 | 360 | } |
| 361 | + |
| 362 | + /** |
| 363 | + * Custom cell-data converter. |
| 364 | + * Converts every Excel cell to a string so that numeric, date and similar cells are rendered consistently when read. |
| 365 | + * Limitation: the fast excel package has no FORMULA cell type, so formula cells are read as their computed values. |
| 366 | + * |
| 367 | + */ |
| 368 | + public static class CustomCellStringConverter implements Converter<String> { |
| 369 | + @Override |
| 370 | + public Class<String> supportJavaTypeKey() { |
| 371 | + return String.class; |
| 372 | + } |
| 373 | + @Override |
| 374 | + public CellDataTypeEnum supportExcelTypeKey() { |
| 375 | + return null; /* NOTE(review): no specific Excel cell type is declared - confirm the library accepts a null key here */ |
| 376 | + } |
| 377 | + @Override |
| 378 | + public String convertToJavaData(ReadCellData<?> cellData, ExcelContentProperty contentProperty, |
| 379 | + GlobalConfiguration globalConfiguration) { |
| 380 | + return getCellValueAsString(cellData); /* delegates to the enclosing class's static helper */ |
| 381 | + } |
| 382 | + } |
342 | 383 | } |
0 commit comments