66
77package modelengine .fit .jober .aipp .service .impl ;
88
9- import cn .idev .excel .ExcelReader ;
10- import cn .idev .excel .FastExcel ;
11- import cn .idev .excel .context .AnalysisContext ;
12- import cn .idev .excel .converters .Converter ;
13- import cn .idev .excel .enums .CellDataTypeEnum ;
14- import cn .idev .excel .metadata .GlobalConfiguration ;
15- import cn .idev .excel .metadata .data .DataFormatData ;
16- import cn .idev .excel .metadata .data .ReadCellData ;
17- import cn .idev .excel .metadata .property .ExcelContentProperty ;
18- import cn .idev .excel .read .listener .ReadListener ;
19- import cn .idev .excel .read .metadata .ReadSheet ;
20- import cn .idev .excel .util .DateUtils ;
219import modelengine .fit .jober .aipp .common .exception .AippErrCode ;
2210import modelengine .fit .jober .aipp .common .exception .AippException ;
2311import modelengine .fit .jober .aipp .service .LLMService ;
2412import modelengine .fit .jober .aipp .service .OperatorService ;
13+ import modelengine .fit .jober .aipp .tool .FileExtractorContainer ;
2514import modelengine .fit .jober .aipp .util .AippFileUtils ;
2615import modelengine .fit .jober .aipp .util .AippStringUtils ;
2716import modelengine .fitframework .annotation .Component ;
3221import org .apache .pdfbox .pdmodel .PDDocument ;
3322import org .apache .pdfbox .text .PDFTextStripper ;
3423import org .apache .poi .poifs .filesystem .FileMagic ;
35- import org .apache .poi .ss .usermodel .Cell ;
36- import org .apache .poi .ss .usermodel .DateUtil ;
37- import org .apache .poi .ss .usermodel .Row ;
38- import org .apache .poi .ss .usermodel .Sheet ;
39- import org .apache .poi .ss .usermodel .Workbook ;
40- import org .apache .poi .xssf .usermodel .XSSFWorkbook ;
24+
4125import org .apache .poi .xwpf .extractor .XWPFWordExtractor ;
4226import org .apache .poi .xwpf .usermodel .XWPFDocument ;
4327import org .apache .poi .xwpf .usermodel .XWPFParagraph ;
4630import org .openxmlformats .schemas .wordprocessingml .x2006 .main .CTHdrFtr ;
4731
4832import java .io .*;
49- import java .math .BigDecimal ;
5033import java .nio .charset .StandardCharsets ;
5134import java .nio .file .Files ;
5235import java .nio .file .Paths ;
53- import java .text .SimpleDateFormat ;
5436import java .util .*;
55- import java .util .concurrent .ConcurrentHashMap ;
5637import java .util .function .Function ;
5738import java .util .stream .Collectors ;
5839
@@ -99,7 +80,6 @@ public class OperatorServiceImpl implements OperatorService {
// Collaborators assigned once in the constructor.
9980 private final LLMService llmService ;
10081 private final BrokerClient client ;
// Per-format extractors bound to private methods of this class; each maps a file path string to a text string.
10182 private final Function <String , String > pdfExtractor = this ::extractPdfFile ;
// NOTE(review): the "-" marker suggests this Excel extractor is on the removed side of a diff — confirm it still exists.
102- private final Function <String , String > excelExtractor = this ::extractExcelFile ;
10383 private final Function <String , String > wordExtractor = this ::extractWordFile ;
10484 private final Function <String , String > textExtractor = this ::extractTextFile ;
10585 private final EnumMap <FileType , Function <File , String >> outlineOperatorMap =
@@ -109,45 +89,24 @@ public class OperatorServiceImpl implements OperatorService {
10989 }
11090 };
11191
// NOTE(review): the "-" and "+" runs below look like the removed and added sides of a diff
// for the same field — confirm which declaration is the current one.
// "-" side: dispatch table from FileType to extractor, including an EXCEL entry.
112- private final EnumMap <FileType , Function <String , String >> fileOperatorMap
113- = new EnumMap <FileType , Function <String , String >>(FileType .class ) {
114- {
115- put (FileType .PDF , pdfExtractor );
116- put (FileType .WORD , wordExtractor );
117- put (FileType .EXCEL , excelExtractor );
118- put (FileType .TXT , textExtractor );
119- put (FileType .HTML , textExtractor );
120- put (FileType .MARKDOWN , textExtractor );
121- put ( FileType . CSV , textExtractor );
122- }
123- } ;
// "+" side: the same dispatch table without the EXCEL entry; TXT, HTML, MARKDOWN and CSV
// all share textExtractor.
92+ private final EnumMap <FileType , Function <String , String >> fileOperatorMap =
93+ new EnumMap <FileType , Function <String , String >>(FileType .class ) {
94+ {
95+ put (FileType .PDF , pdfExtractor );
96+ put (FileType .WORD , wordExtractor );
97+ put (FileType .TXT , textExtractor );
98+ put (FileType .HTML , textExtractor );
99+ put (FileType .MARKDOWN , textExtractor );
100+ put (FileType .CSV , textExtractor );
101+ }
102+ };
// Consulted by fileExtractor before it falls back to fileOperatorMap.
103+ private final FileExtractorContainer fileExtractorContainer ;
124104
// NOTE(review): this span interleaves both sides of a diff. The "-" lines hold the two-argument
// constructor and the getCellValueAsString helper; the "+" lines hold the three-argument
// constructor. The final closing brace is shared by both readings — confirm which side is current.
// Constructor: stores the supplied collaborators in the corresponding final fields.
125- public OperatorServiceImpl (LLMService llmService , BrokerClient client ) {
105+ public OperatorServiceImpl (LLMService llmService , BrokerClient client ,
106+ FileExtractorContainer fileExtractorContainer ) {
126107 this .llmService = llmService ;
127108 this .client = client ;
128- }
129-
// Renders a single cell as text, switching on the cell type.
130- private static String getCellValueAsString (ReadCellData <?> cell ) {
131- switch (cell .getType ()) {
132- case STRING :
// String cells are returned unchanged.
133- return cell .getStringValue ();
134- case NUMBER :
// NOTE(review): fmt is dereferenced without a null check — confirm getDataFormatData()
// cannot return null for NUMBER cells.
135- DataFormatData fmt = cell .getDataFormatData ();
136- short formatIndex = fmt .getIndex ();
137- String formatString = fmt .getFormat ();
// Numeric cells carrying a date format are rendered as yyyy-MM-dd.
138- if (DateUtils .isADateFormat (formatIndex ,formatString )) {
139- double value = cell .getNumberValue ().doubleValue ();
// NOTE(review): the second argument is hard-coded to true; if it is a 1904-windowing flag
// (as in POI's DateUtil.getJavaDate), dates from 1900-based workbooks would be shifted — confirm.
140- Date date = DateUtils .getJavaDate (value ,true );
141- return new SimpleDateFormat ("yyyy-MM-dd" ).format (date );
142- } else {
// Other numbers: plain decimal notation with trailing zeros removed.
143- BigDecimal num = cell .getNumberValue ();
144- return num .stripTrailingZeros ().toPlainString ();
145- }
146- case BOOLEAN :
147- return Boolean .toString (cell .getBooleanValue ());
148- default :
// Every other cell type is rendered as an empty string.
149- return "" ;
150- }
109+ this .fileExtractorContainer = fileExtractorContainer ;
151110 }
152111
153112 private static String extractDocHandle (InputStream fis , String fileName ) throws IOException {
@@ -250,61 +209,16 @@ public File createDoc(String instanceId, String fileName, String txt) throws IOE
250209 */
251210 public String fileExtractor (String fileUrl , Optional <FileType > optionalFileType ) {
252211 if (optionalFileType .isPresent ()) {
212+ String res = fileExtractorContainer .extract (fileUrl , optionalFileType .get ());
213+ if (!res .isEmpty ()) {
214+ return res ;
215+ }
253216 Function <String , String > function = this .fileOperatorMap .get (optionalFileType .get ());
254217 return Optional .ofNullable (function ).map (f -> f .apply (fileUrl )).orElse (StringUtils .EMPTY );
255218 }
256219 return this .extractTextFile (fileUrl );
257220 }
258221
259- /**
260- * 从指定路径的 Excel 文件中提取内容,并返回为字符串形式。
261- * 实现方式:
262- * 基于 fast-excel 包,使用流式读取(ReadListener)逐行解析,避免一次性加载整表造成的内存开销。
263- * 每行数据会被转换为以制表符(\t)分隔的文本,并在行末追加换行符。
264- * 支持多 sheet 解析,会依次读取工作簿中的每一个 sheet。
265- *
266- * @param fileUrl 表示文件路径的 {@link String}.
267- * @return 表示文件内容的 {@link String}。
268- * @throws RuntimeException 当文件读取或解析失败时抛出
269- */
270- private String extractExcelFile (String fileUrl ) {
271- File file = Paths .get (fileUrl ).toFile ();
272- StringBuilder excelContent = new StringBuilder ();
273- ReadListener <Map <Integer , String >> listener = new ReadListener <>() {
274- @ Override
275- public void invoke (Map <Integer , String > data , AnalysisContext context ) {
276- String line = data .entrySet ().stream ()
277- .sorted (Map .Entry .comparingByKey ())
278- .map (e -> e .getValue () == null ? "" : e .getValue ())
279- .collect (Collectors .joining ("\t " ));
280- excelContent .append (line ).append ('\n' );
281- }
282- @ Override
283- public void doAfterAllAnalysed (AnalysisContext context ) {
284- }
285- };
286- try (InputStream is = new BufferedInputStream (Files .newInputStream (file .toPath ()))) {
287- ExcelReader reader = FastExcel .read (is , listener )
288- .registerConverter (new CustomCellStringConverter ())
289- .headRowNumber (0 )
290- .build ();
291-
292- List <ReadSheet > sheets = reader .excelExecutor ().sheetList ();
293- for (ReadSheet meta : sheets ) {
294- excelContent .append ("Sheet " ).append (meta .getSheetNo () + 1 ).append (':' ).append ('\n' );
295- ReadSheet readSheet = FastExcel .readSheet (meta .getSheetNo ())
296- .headRowNumber (0 )
297- .build ();
298- reader .read (readSheet );
299- }
300- excelContent .append ('\n' );
301- reader .finish (); // 关闭资源
302- } catch (IOException e ) {
303- throw new RuntimeException (e );
304- }
305- return excelContent .toString ();
306- }
307-
308222 private String iterPdf (PDDocument doc ) throws IOException {
309223 int pages = doc .getNumberOfPages ();
310224 StringBuilder sb = new StringBuilder ();
@@ -359,25 +273,4 @@ private String extractTextFile(String fileUrl) {
359273 }
360274 }
361275
362- /**
363- * 自定义单元格数据转换器。
364- * 将 Excel 单元格数据统一转换为字符串,避免数值/日期等类型在读取时格式不一致的问题。
365- * 缺点:由于采用fast excel包,没有 FORMULA类,会将公式单元格自动计算为值
366- *
367- */
368- public static class CustomCellStringConverter implements Converter <String > {
369- @ Override
370- public Class <String > supportJavaTypeKey () {
371- return String .class ;
372- }
373- @ Override
374- public CellDataTypeEnum supportExcelTypeKey () {
375- return null ;
376- }
377- @ Override
378- public String convertToJavaData (ReadCellData <?> cellData , ExcelContentProperty contentProperty ,
379- GlobalConfiguration globalConfiguration ) {
380- return getCellValueAsString (cellData );
381- }
382- }
383276}
0 commit comments