22
33use crate :: Result ;
44use crate :: core:: config:: ExtractionConfig ;
5+ use crate :: core:: config:: formats:: OutputFormat ;
56use crate :: plugins:: { DocumentExtractor , Plugin } ;
67use crate :: types:: { ExcelMetadata , ExtractionResult , Metadata , Table } ;
78use ahash:: AHashMap ;
@@ -76,7 +77,7 @@ impl Plugin for ExcelExtractor {
7677#[ async_trait]
7778impl DocumentExtractor for ExcelExtractor {
7879 #[ cfg_attr( feature = "otel" , tracing:: instrument(
79- skip( self , content, _config ) ,
80+ skip( self , content, config ) ,
8081 fields(
8182 extractor. name = self . name( ) ,
8283 content. size_bytes = content. len( ) ,
@@ -86,7 +87,7 @@ impl DocumentExtractor for ExcelExtractor {
8687 & self ,
8788 content : & [ u8 ] ,
8889 mime_type : & str ,
89- _config : & ExtractionConfig ,
90+ config : & ExtractionConfig ,
9091 ) -> Result < ExtractionResult > {
9192 let extension = match mime_type {
9293 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" => ".xlsx" ,
@@ -114,7 +115,12 @@ impl DocumentExtractor for ExcelExtractor {
114115 crate :: extraction:: excel:: read_excel_bytes ( content, extension) ?
115116 } ;
116117
117- let text_content = crate :: extraction:: excel:: excel_to_text ( & workbook) ;
118+ let content = match config. output_format {
119+ OutputFormat :: Markdown | OutputFormat :: Djot | OutputFormat :: Html => {
120+ crate :: extraction:: excel:: excel_to_markdown ( & workbook)
121+ }
122+ _ => crate :: extraction:: excel:: excel_to_text ( & workbook) ,
123+ } ;
118124 let tables = Self :: sheets_to_tables ( & workbook) ;
119125
120126 let sheet_names: Vec < String > = workbook. sheets . iter ( ) . map ( |s| s. name . clone ( ) ) . collect ( ) ;
@@ -131,7 +137,7 @@ impl DocumentExtractor for ExcelExtractor {
131137 }
132138
133139 Ok ( ExtractionResult {
134- content : text_content ,
140+ content,
135141 mime_type : mime_type. to_string ( ) . into ( ) ,
136142 metadata : Metadata {
137143 format : Some ( crate :: types:: FormatMetadata :: Excel ( excel_metadata) ) ,
@@ -156,18 +162,23 @@ impl DocumentExtractor for ExcelExtractor {
156162 }
157163
158164 #[ cfg_attr( feature = "otel" , tracing:: instrument(
159- skip( self , path, _config ) ,
165+ skip( self , path, config ) ,
160166 fields(
161167 extractor. name = self . name( ) ,
162168 )
163169 ) ) ]
164- async fn extract_file ( & self , path : & Path , mime_type : & str , _config : & ExtractionConfig ) -> Result < ExtractionResult > {
170+ async fn extract_file ( & self , path : & Path , mime_type : & str , config : & ExtractionConfig ) -> Result < ExtractionResult > {
165171 let path_str = path
166172 . to_str ( )
167173 . ok_or_else ( || crate :: KreuzbergError :: validation ( "Invalid file path" . to_string ( ) ) ) ?;
168174
169175 let workbook = crate :: extraction:: excel:: read_excel_file ( path_str) ?;
170- let text_content = crate :: extraction:: excel:: excel_to_text ( & workbook) ;
176+ let content = match config. output_format {
177+ OutputFormat :: Markdown | OutputFormat :: Djot | OutputFormat :: Html => {
178+ crate :: extraction:: excel:: excel_to_markdown ( & workbook)
179+ }
180+ _ => crate :: extraction:: excel:: excel_to_text ( & workbook) ,
181+ } ;
171182 let tables = Self :: sheets_to_tables ( & workbook) ;
172183
173184 let sheet_names: Vec < String > = workbook. sheets . iter ( ) . map ( |s| s. name . clone ( ) ) . collect ( ) ;
@@ -184,7 +195,7 @@ impl DocumentExtractor for ExcelExtractor {
184195 }
185196
186197 Ok ( ExtractionResult {
187- content : text_content ,
198+ content,
188199 mime_type : mime_type. to_string ( ) . into ( ) ,
189200 metadata : Metadata {
190201 format : Some ( crate :: types:: FormatMetadata :: Excel ( excel_metadata) ) ,
0 commit comments