Skip to content

Commit bce9da7

Browse files
committed
fix(pdf): classify headings for structure tree pages with font-size variation (#391)
PDFs where the structure tree tags everything as <P> (e.g. Adobe InDesign) produced plain text with no headings or bold. The structure tree path bypassed font-size-based heading classification entirely. Now, structure tree pages with font size variation but no heading tags are enriched via K-means font-size clustering in Stage 2/3 of the pipeline. Bold detection also recognizes fonts with "Bold" in the name when the PDF doesn't set the font weight descriptor.
1 parent 8938c11 commit bce9da7

File tree

7 files changed

+266
-6
lines changed

7 files changed

+266
-6
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
2121

2222
### Fixed
2323

24+
- **PDF markdown extraction missing headings/bold for flat structure trees** (#391): PDFs where the structure tree tags everything as `<P>` (common with Adobe InDesign) now produce proper headings and bold text. The structure tree path previously bypassed font-size-based heading classification entirely. Pages with font size variation but no heading tags are now enriched via K-means font-size clustering. Additionally, bold detection now recognizes fonts with "Bold" in the name (e.g. `MyriadPro-Bold`) even when the PDF doesn't set the font weight descriptor.
2425
- **WASM metadata serialization**: Fixed `#[serde(flatten)]` with internally-tagged enums dropping `format_type` and format-specific metadata fields. Switched from `serde_wasm_bindgen` to `serde_json` + `JSON.parse()` for output serialization.
2526
- **WASM config deserialization**: Fixed camelCase TypeScript config keys (e.g. `outputFormat`, `extractAnnotations`) not being recognized by Rust serde. Config keys are now converted to snake_case before passing to the WASM boundary.
2627
- **WASM PDFium module loading**: Fixed `copy-pkg.js` overwriting the real PDFium Emscripten module with a stub init helper. The build script now locates and copies the actual PDFium ESM module (`pdfium.esm.js` + `pdfium.esm.wasm`) from the Cargo build output, with a Deno compatibility fix for bare `import("module")`.

crates/kreuzberg-pdfium-render/src/pdf/document/page/extraction.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,8 @@ fn build_mcid_style_map(page: &PdfPage<'_>) -> Result<HashMap<i32, TextStyle>, P
193193
w,
194194
PdfFontWeight::Weight700Bold | PdfFontWeight::Weight800 | PdfFontWeight::Weight900
195195
)
196-
}) || font.is_bold_reenforced();
196+
}) || font.is_bold_reenforced()
197+
|| font.name().to_ascii_lowercase().contains("bold");
197198
let is_italic = font.is_italic();
198199
let font_size = text_obj.scaled_font_size().value;
199200

@@ -400,7 +401,8 @@ fn extract_via_heuristics(page: &PdfPage<'_>) -> Result<PageExtraction, PdfiumEr
400401
w,
401402
PdfFontWeight::Weight700Bold | PdfFontWeight::Weight800 | PdfFontWeight::Weight900
402403
)
403-
}) || font.is_bold_reenforced();
404+
}) || font.is_bold_reenforced()
405+
|| font.name().to_ascii_lowercase().contains("bold");
404406
let is_italic = font.is_italic();
405407

406408
let bounds = object.bounds().ok().map(|qp| qp.to_rect());

crates/kreuzberg-pdfium-render/src/pdf/document/page/text/char.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -317,7 +317,8 @@ impl<'a> PdfPageTextChar<'a> {
317317
pub fn font_info(&self) -> (String, bool, bool) {
318318
let (name, flags) = self.font();
319319
let name = name.unwrap_or_default();
320-
let is_bold = flags.contains(FpdfFontDescriptorFlags::FORCE_BOLD_BIT_19);
320+
let is_bold =
321+
flags.contains(FpdfFontDescriptorFlags::FORCE_BOLD_BIT_19) || name.to_ascii_lowercase().contains("bold");
321322
let is_italic = flags.contains(FpdfFontDescriptorFlags::ITALIC_BIT_7);
322323
(name, is_bold, is_italic)
323324
}

crates/kreuzberg/src/extractors/pdf/extraction.rs

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,12 @@ pub(crate) fn extract_all_from_document(
6565
config.output_format,
6666
OutputFormat::Markdown | OutputFormat::Djot | OutputFormat::Html
6767
);
68+
tracing::debug!(
69+
output_format = ?config.output_format,
70+
needs_structured,
71+
force_ocr = config.force_ocr,
72+
"PDF markdown path: evaluating whether to render structured markdown"
73+
);
6874
let pre_rendered_markdown = if needs_structured && !config.force_ocr {
6975
let k = config
7076
.pdf_options
@@ -85,6 +91,10 @@ pub(crate) fn extract_all_from_document(
8591
.filter(|p| p.insert_page_markers)
8692
.map(|p| p.marker_format.as_str());
8793

94+
tracing::debug!(
95+
k_clusters = k,
96+
"PDF markdown path: calling render_document_as_markdown_with_tables"
97+
);
8898
match crate::pdf::markdown::render_document_as_markdown_with_tables(
8999
document,
90100
k,
@@ -93,7 +103,15 @@ pub(crate) fn extract_all_from_document(
93103
bottom_margin,
94104
page_marker_format,
95105
) {
96-
Ok(md) if !md.trim().is_empty() => Some(md),
106+
Ok(md) if !md.trim().is_empty() => {
107+
tracing::debug!(
108+
md_len = md.len(),
109+
has_headings = md.contains("# "),
110+
has_bold = md.contains("**"),
111+
"PDF markdown path: render succeeded with content"
112+
);
113+
Some(md)
114+
}
97115
Ok(_) => {
98116
tracing::warn!("Markdown rendering produced empty output, will fall back to plain text");
99117
None

crates/kreuzberg/src/extractors/pdf/mod.rs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,13 @@ impl DocumentExtractor for PdfExtractor {
272272
// so that we can inject image placeholders into it before finalizing the text.
273273
#[cfg(feature = "pdf")]
274274
let use_pdf_markdown = !used_ocr && pre_rendered_markdown.is_some();
275+
tracing::debug!(
276+
used_ocr,
277+
has_pre_rendered = pre_rendered_markdown.is_some(),
278+
use_pdf_markdown,
279+
pre_rendered_len = pre_rendered_markdown.as_ref().map(|m| m.len()).unwrap_or(0),
280+
"PDF extractor: deciding whether to use pre-rendered markdown"
281+
);
275282

276283
#[cfg(not(feature = "pdf"))]
277284
let use_pdf_markdown = false;
@@ -393,8 +400,14 @@ impl DocumentExtractor for PdfExtractor {
393400
// content but still need apply_output_format() for format-specific conversion.
394401
#[cfg(feature = "pdf")]
395402
let pre_formatted_output = if used_pdf_markdown && config.output_format == OutputFormat::Markdown {
403+
tracing::trace!("PDF extractor: signaling pre-formatted markdown to pipeline");
396404
Some("markdown".to_string())
397405
} else {
406+
tracing::trace!(
407+
used_pdf_markdown,
408+
output_format = ?config.output_format,
409+
"PDF extractor: NOT signaling pre-formatted markdown"
410+
);
398411
None
399412
};
400413
#[cfg(not(feature = "pdf"))]

crates/kreuzberg/src/pdf/markdown/pipeline.rs

Lines changed: 161 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ pub fn render_document_as_markdown_with_tables(
3030
) -> Result<String> {
3131
let pages = document.pages();
3232
let page_count = pages.len();
33+
tracing::debug!(page_count, "PDF markdown pipeline: starting render");
3334

3435
// Stage 0: Try structure tree extraction for each page.
3536
let mut struct_tree_results: Vec<Option<Vec<PdfParagraph>>> = Vec::with_capacity(page_count as usize);
@@ -42,6 +43,25 @@ pub fn render_document_as_markdown_with_tables(
4243

4344
match extract_page_content(&page) {
4445
Ok(extraction) if extraction.method == ExtractionMethod::StructureTree && !extraction.blocks.is_empty() => {
46+
tracing::trace!(
47+
page = i,
48+
method = ?extraction.method,
49+
block_count = extraction.blocks.len(),
50+
"PDF markdown pipeline: page extracted via structure tree"
51+
);
52+
// Log the roles of the first few blocks for debugging
53+
for (bi, block) in extraction.blocks.iter().take(10).enumerate() {
54+
tracing::trace!(
55+
page = i,
56+
block_index = bi,
57+
role = ?block.role,
58+
text_preview = &block.text[..block.text.len().min(60)],
59+
font_size = ?block.font_size,
60+
is_bold = block.is_bold,
61+
child_count = block.children.len(),
62+
"PDF markdown pipeline: structure tree block"
63+
);
64+
}
4565
let page_width = page.width().value;
4666
let filtered_blocks = filter_sidebar_blocks(&extraction.blocks, page_width);
4767
let mut paragraphs = extracted_blocks_to_paragraphs(&filtered_blocks);
@@ -84,9 +104,31 @@ pub fn render_document_as_markdown_with_tables(
84104
// Dehyphenate: structure tree path has no positional data,
85105
// so only rejoin explicit trailing hyphens.
86106
dehyphenate_paragraphs(&mut paragraphs, false);
107+
let heading_count = paragraphs.iter().filter(|p| p.heading_level.is_some()).count();
108+
let bold_count = paragraphs.iter().filter(|p| p.is_bold).count();
109+
let has_font_variation = has_font_size_variation(&paragraphs);
110+
tracing::trace!(
111+
page = i,
112+
paragraph_count = paragraphs.len(),
113+
heading_count,
114+
bold_count,
115+
has_font_variation,
116+
"PDF markdown pipeline: structure tree paragraphs after conversion"
117+
);
87118
if paragraphs.is_empty() {
88119
struct_tree_results.push(None);
89120
heuristic_pages.push(i as usize);
121+
} else if heading_count == 0 && has_font_variation {
122+
// Structure tree has text with font size variation but no
123+
// heading tags. Add to heuristic extraction for font-size
124+
// clustering data; heading classification will be applied
125+
// to these paragraphs in Stage 3.
126+
tracing::debug!(
127+
page = i,
128+
"PDF markdown pipeline: structure tree has font variation but no headings, will classify via font-size clustering"
129+
);
130+
struct_tree_results.push(Some(paragraphs));
131+
heuristic_pages.push(i as usize);
90132
} else {
91133
struct_tree_results.push(Some(paragraphs));
92134
}
@@ -175,7 +217,26 @@ pub fn render_document_as_markdown_with_tables(
175217
all_image_positions.extend(image_positions);
176218
}
177219

178-
// Stage 2: Global font-size clustering (only for heuristic pages).
220+
// Identify structure tree pages that have font size variation but no
221+
// heading signals — these need font-size-based heading classification.
222+
// Pages with no font variation are left as plain paragraphs (classify
223+
// would incorrectly assign headings based on unrelated pages' font data).
224+
let struct_tree_needs_classify: std::collections::HashSet<usize> = struct_tree_results
225+
.iter()
226+
.enumerate()
227+
.filter_map(|(i, result)| {
228+
result.as_ref().and_then(|paragraphs| {
229+
let has_headings = paragraphs.iter().any(|p| p.heading_level.is_some());
230+
if !has_headings && has_font_size_variation(paragraphs) {
231+
Some(i)
232+
} else {
233+
None
234+
}
235+
})
236+
})
237+
.collect();
238+
239+
// Stage 2: Global font-size clustering (heuristic pages + struct tree pages needing classification).
179240
let mut all_blocks: Vec<TextBlock> = Vec::new();
180241
let empty_bbox = BoundingBox {
181242
left: 0.0,
@@ -195,6 +256,18 @@ pub fn render_document_as_markdown_with_tables(
195256
});
196257
}
197258
}
259+
// Include font sizes from struct tree pages that need classification.
260+
for &i in &struct_tree_needs_classify {
261+
if let Some(paragraphs) = &struct_tree_results[i] {
262+
for para in paragraphs {
263+
all_blocks.push(TextBlock {
264+
text: String::new(),
265+
bbox: empty_bbox,
266+
font_size: para.dominant_font_size,
267+
});
268+
}
269+
}
270+
}
198271

199272
let heading_map = if all_blocks.is_empty() {
200273
Vec::new()
@@ -206,7 +279,17 @@ pub fn render_document_as_markdown_with_tables(
206279
// Stage 3: Per-page structured extraction.
207280
let mut all_page_paragraphs: Vec<Vec<PdfParagraph>> = Vec::with_capacity(page_count as usize);
208281
for i in 0..page_count as usize {
209-
if let Some(paragraphs) = struct_tree_results[i].take() {
282+
if let Some(mut paragraphs) = struct_tree_results[i].take() {
283+
// Apply heading classification to struct tree pages that have
284+
// font size variation but no structure-tree-level headings.
285+
if struct_tree_needs_classify.contains(&i) {
286+
tracing::debug!(
287+
page = i,
288+
"PDF markdown pipeline: classifying struct tree page via font-size clustering"
289+
);
290+
classify_paragraphs(&mut paragraphs, &heading_map);
291+
merge_continuation_paragraphs(&mut paragraphs);
292+
}
210293
all_page_paragraphs.push(paragraphs);
211294
} else {
212295
let lines = segments_to_lines(std::mem::take(&mut all_page_segments[i]));
@@ -245,8 +328,21 @@ pub fn render_document_as_markdown_with_tables(
245328
// demote numbered section headings when a title H1 is detected.
246329
refine_heading_hierarchy(&mut all_page_paragraphs);
247330

331+
let total_paragraphs: usize = all_page_paragraphs.iter().map(|p| p.len()).sum();
332+
tracing::debug!(
333+
heuristic_page_count = heuristic_pages.len(),
334+
total_paragraphs,
335+
heading_map_len = heading_map.len(),
336+
"PDF markdown pipeline: stage 3 complete, assembling markdown"
337+
);
338+
248339
// Stage 4: Assemble markdown with tables interleaved
249340
let markdown = assemble_markdown_with_tables(all_page_paragraphs, tables, page_marker_format);
341+
tracing::debug!(
342+
markdown_len = markdown.len(),
343+
has_headings = markdown.contains("# "),
344+
"PDF markdown pipeline: assembly complete"
345+
);
250346

251347
// Stage 5: Inject image placeholders from positions collected during object extraction
252348
if all_image_positions.is_empty() {
@@ -495,6 +591,26 @@ fn apply_dehyphenation_join(
495591
}
496592
}
497593

594+
/// Check if paragraphs have meaningful font size variation.
595+
///
596+
/// Returns true if there are at least 2 distinct non-zero font sizes,
597+
/// indicating that font-size clustering could identify heading candidates.
598+
fn has_font_size_variation(paragraphs: &[PdfParagraph]) -> bool {
599+
let mut first_size: Option<f32> = None;
600+
for para in paragraphs {
601+
let size = para.dominant_font_size;
602+
if size <= 0.0 {
603+
continue;
604+
}
605+
match first_size {
606+
None => first_size = Some(size),
607+
Some(fs) if (size - fs).abs() > 0.5 => return true,
608+
_ => {}
609+
}
610+
}
611+
false
612+
}
613+
498614
#[cfg(test)]
499615
mod tests {
500616
use super::*;
@@ -695,4 +811,47 @@ mod tests {
695811
assert_eq!(p.lines[0].segments[1].text, "software");
696812
assert_eq!(p.lines[1].segments[0].text, "next words");
697813
}
814+
815+
// ── has_font_size_variation tests ──
816+
817+
fn para_with_font_size(font_size: f32) -> PdfParagraph {
818+
PdfParagraph {
819+
lines: vec![line(vec![seg("text", 0.0, 100.0)])],
820+
dominant_font_size: font_size,
821+
heading_level: None,
822+
is_bold: false,
823+
is_list_item: false,
824+
is_code_block: false,
825+
}
826+
}
827+
#[test]
fn test_has_font_size_variation_empty() {
    // With no paragraphs there are no sizes to compare.
    let no_paragraphs: &[PdfParagraph] = &[];
    assert!(!has_font_size_variation(no_paragraphs));
}
832+
#[test]
fn test_has_font_size_variation_single_size() {
    // Two paragraphs sharing one size must not count as variation.
    let uniform = [12.0, 12.0].map(para_with_font_size);
    assert!(!has_font_size_variation(&uniform));
}
838+
#[test]
fn test_has_font_size_variation_different_sizes() {
    // A 6pt gap is well beyond the tolerance.
    let mixed = [12.0, 18.0].map(para_with_font_size);
    assert!(has_font_size_variation(&mixed));
}
844+
#[test]
fn test_has_font_size_variation_small_difference_ignored() {
    // A 0.3pt gap sits inside the 0.5pt tolerance, so it is not variation.
    let nearly_equal = [12.0, 12.3].map(para_with_font_size);
    assert!(!has_font_size_variation(&nearly_equal));
}
851+
#[test]
fn test_has_font_size_variation_zero_sizes_ignored() {
    // Zero sizes are skipped, leaving nothing to compare.
    let unsized_paragraphs = [0.0, 0.0].map(para_with_font_size);
    assert!(!has_font_size_variation(&unsized_paragraphs));
}
698857
}

0 commit comments

Comments
 (0)