Skip to content

Commit 0e9b8ee

Browse files
committed
fix(excel): respect output_format config in XLSX extraction (#405)
ExcelExtractor now uses excel_to_markdown() when output_format is Markdown/Djot/Html, restoring tabular structure lost in eae75bb. Plain and Structured formats continue using excel_to_text() for quality scoring compatibility.
1 parent ae38ba6 commit 0e9b8ee

File tree

2 files changed

+20
-8
lines changed

2 files changed

+20
-8
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
2424
- WASM bindings now export `detectMimeFromBytes` and `getExtensionsForMime` MIME utility functions
2525
- Node.js NAPI-RS binding correctly exposes `annotations` field on `ExtractionResult`
2626
- Python output format validation tests updated to reflect `json` as a valid format (alias for `structured`)
27+
- XLSX extraction with `output_format` set to `markdown` (and likewise `djot` or `html`) now builds its content from markdown tables instead of plain text (#405)
2728

2829
---
2930

crates/kreuzberg/src/extractors/excel.rs

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
33
use crate::Result;
44
use crate::core::config::ExtractionConfig;
5+
use crate::core::config::formats::OutputFormat;
56
use crate::plugins::{DocumentExtractor, Plugin};
67
use crate::types::{ExcelMetadata, ExtractionResult, Metadata, Table};
78
use ahash::AHashMap;
@@ -76,7 +77,7 @@ impl Plugin for ExcelExtractor {
7677
#[async_trait]
7778
impl DocumentExtractor for ExcelExtractor {
7879
#[cfg_attr(feature = "otel", tracing::instrument(
79-
skip(self, content, _config),
80+
skip(self, content, config),
8081
fields(
8182
extractor.name = self.name(),
8283
content.size_bytes = content.len(),
@@ -86,7 +87,7 @@ impl DocumentExtractor for ExcelExtractor {
8687
&self,
8788
content: &[u8],
8889
mime_type: &str,
89-
_config: &ExtractionConfig,
90+
config: &ExtractionConfig,
9091
) -> Result<ExtractionResult> {
9192
let extension = match mime_type {
9293
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" => ".xlsx",
@@ -114,7 +115,12 @@ impl DocumentExtractor for ExcelExtractor {
114115
crate::extraction::excel::read_excel_bytes(content, extension)?
115116
};
116117

117-
let text_content = crate::extraction::excel::excel_to_text(&workbook);
118+
let content = match config.output_format {
119+
OutputFormat::Markdown | OutputFormat::Djot | OutputFormat::Html => {
120+
crate::extraction::excel::excel_to_markdown(&workbook)
121+
}
122+
_ => crate::extraction::excel::excel_to_text(&workbook),
123+
};
118124
let tables = Self::sheets_to_tables(&workbook);
119125

120126
let sheet_names: Vec<String> = workbook.sheets.iter().map(|s| s.name.clone()).collect();
@@ -131,7 +137,7 @@ impl DocumentExtractor for ExcelExtractor {
131137
}
132138

133139
Ok(ExtractionResult {
134-
content: text_content,
140+
content,
135141
mime_type: mime_type.to_string().into(),
136142
metadata: Metadata {
137143
format: Some(crate::types::FormatMetadata::Excel(excel_metadata)),
@@ -156,18 +162,23 @@ impl DocumentExtractor for ExcelExtractor {
156162
}
157163

158164
#[cfg_attr(feature = "otel", tracing::instrument(
159-
skip(self, path, _config),
165+
skip(self, path, config),
160166
fields(
161167
extractor.name = self.name(),
162168
)
163169
))]
164-
async fn extract_file(&self, path: &Path, mime_type: &str, _config: &ExtractionConfig) -> Result<ExtractionResult> {
170+
async fn extract_file(&self, path: &Path, mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
165171
let path_str = path
166172
.to_str()
167173
.ok_or_else(|| crate::KreuzbergError::validation("Invalid file path".to_string()))?;
168174

169175
let workbook = crate::extraction::excel::read_excel_file(path_str)?;
170-
let text_content = crate::extraction::excel::excel_to_text(&workbook);
176+
let content = match config.output_format {
177+
OutputFormat::Markdown | OutputFormat::Djot | OutputFormat::Html => {
178+
crate::extraction::excel::excel_to_markdown(&workbook)
179+
}
180+
_ => crate::extraction::excel::excel_to_text(&workbook),
181+
};
171182
let tables = Self::sheets_to_tables(&workbook);
172183

173184
let sheet_names: Vec<String> = workbook.sheets.iter().map(|s| s.name.clone()).collect();
@@ -184,7 +195,7 @@ impl DocumentExtractor for ExcelExtractor {
184195
}
185196

186197
Ok(ExtractionResult {
187-
content: text_content,
198+
content,
188199
mime_type: mime_type.to_string().into(),
189200
metadata: Metadata {
190201
format: Some(crate::types::FormatMetadata::Excel(excel_metadata)),

0 commit comments

Comments
 (0)