fix(pdf): classify headings for structure tree pages with font-size variation (#391)

Goldziher · Goldziher · commit bce9da71347d · 2026-02-24T18:45:12.000+01:00
PDFs where the structure tree tags everything as <P> (e.g. Adobe InDesign)
produced plain text with no headings or bold. The structure tree path
bypassed font-size-based heading classification entirely.

Now, structure tree pages with font size variation but no heading tags are
enriched via K-means font-size clustering in Stage 2/3 of the pipeline.
Bold detection also recognizes fonts with "Bold" in the name when the PDF
doesn't set the font weight descriptor.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -21,6 +21,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Fixed
 
+- **PDF markdown extraction missing headings/bold for flat structure trees** (#391): PDFs where the structure tree tags everything as `<P>` (common with Adobe InDesign) now produce proper headings and bold text. The structure tree path previously bypassed font-size-based heading classification entirely. Pages with font size variation but no heading tags are now enriched via K-means font-size clustering. Additionally, bold detection now recognizes fonts with "Bold" in the name (e.g. `MyriadPro-Bold`) even when the PDF doesn't set the font weight descriptor.
 - **WASM metadata serialization**: Fixed `#[serde(flatten)]` with internally-tagged enums dropping `format_type` and format-specific metadata fields. Switched from `serde_wasm_bindgen` to `serde_json` + `JSON.parse()` for output serialization.
 - **WASM config deserialization**: Fixed camelCase TypeScript config keys (e.g. `outputFormat`, `extractAnnotations`) not being recognized by Rust serde. Config keys are now converted to snake_case before passing to the WASM boundary.
 - **WASM PDFium module loading**: Fixed `copy-pkg.js` overwriting the real PDFium Emscripten module with a stub init helper. The build script now locates and copies the actual PDFium ESM module (`pdfium.esm.js` + `pdfium.esm.wasm`) from the Cargo build output, with a Deno compatibility fix for bare `import("module")`.
diff --git a/crates/kreuzberg-pdfium-render/src/pdf/document/page/extraction.rs b/crates/kreuzberg-pdfium-render/src/pdf/document/page/extraction.rs
@@ -193,7 +193,8 @@ fn build_mcid_style_map(page: &PdfPage<'_>) -> Result<HashMap<i32, TextStyle>, P
                     w,
                     PdfFontWeight::Weight700Bold | PdfFontWeight::Weight800 | PdfFontWeight::Weight900
                 )
-            }) || font.is_bold_reenforced();
+            }) || font.is_bold_reenforced()
+                || font.name().to_ascii_lowercase().contains("bold");
             let is_italic = font.is_italic();
             let font_size = text_obj.scaled_font_size().value;
 
@@ -400,7 +401,8 @@ fn extract_via_heuristics(page: &PdfPage<'_>) -> Result<PageExtraction, PdfiumEr
                     w,
                     PdfFontWeight::Weight700Bold | PdfFontWeight::Weight800 | PdfFontWeight::Weight900
                 )
-            }) || font.is_bold_reenforced();
+            }) || font.is_bold_reenforced()
+                || font.name().to_ascii_lowercase().contains("bold");
             let is_italic = font.is_italic();
 
             let bounds = object.bounds().ok().map(|qp| qp.to_rect());
diff --git a/crates/kreuzberg-pdfium-render/src/pdf/document/page/text/char.rs b/crates/kreuzberg-pdfium-render/src/pdf/document/page/text/char.rs
@@ -317,7 +317,8 @@ impl<'a> PdfPageTextChar<'a> {
     pub fn font_info(&self) -> (String, bool, bool) {
         let (name, flags) = self.font();
         let name = name.unwrap_or_default();
-        let is_bold = flags.contains(FpdfFontDescriptorFlags::FORCE_BOLD_BIT_19);
+        let is_bold =
+            flags.contains(FpdfFontDescriptorFlags::FORCE_BOLD_BIT_19) || name.to_ascii_lowercase().contains("bold");
         let is_italic = flags.contains(FpdfFontDescriptorFlags::ITALIC_BIT_7);
         (name, is_bold, is_italic)
     }
diff --git a/crates/kreuzberg/src/extractors/pdf/extraction.rs b/crates/kreuzberg/src/extractors/pdf/extraction.rs
@@ -65,6 +65,12 @@ pub(crate) fn extract_all_from_document(
         config.output_format,
         OutputFormat::Markdown | OutputFormat::Djot | OutputFormat::Html
     );
+    tracing::debug!(
+        output_format = ?config.output_format,
+        needs_structured,
+        force_ocr = config.force_ocr,
+        "PDF markdown path: evaluating whether to render structured markdown"
+    );
     let pre_rendered_markdown = if needs_structured && !config.force_ocr {
         let k = config
             .pdf_options
@@ -85,6 +91,10 @@ pub(crate) fn extract_all_from_document(
             .filter(|p| p.insert_page_markers)
             .map(|p| p.marker_format.as_str());
 
+        tracing::debug!(
+            k_clusters = k,
+            "PDF markdown path: calling render_document_as_markdown_with_tables"
+        );
         match crate::pdf::markdown::render_document_as_markdown_with_tables(
             document,
             k,
@@ -93,7 +103,15 @@ pub(crate) fn extract_all_from_document(
             bottom_margin,
             page_marker_format,
         ) {
-            Ok(md) if !md.trim().is_empty() => Some(md),
+            Ok(md) if !md.trim().is_empty() => {
+                tracing::debug!(
+                    md_len = md.len(),
+                    has_headings = md.contains("# "),
+                    has_bold = md.contains("**"),
+                    "PDF markdown path: render succeeded with content"
+                );
+                Some(md)
+            }
             Ok(_) => {
                 tracing::warn!("Markdown rendering produced empty output, will fall back to plain text");
                 None
diff --git a/crates/kreuzberg/src/extractors/pdf/mod.rs b/crates/kreuzberg/src/extractors/pdf/mod.rs
@@ -272,6 +272,13 @@ impl DocumentExtractor for PdfExtractor {
         // so that we can inject image placeholders into it before finalizing the text.
         #[cfg(feature = "pdf")]
         let use_pdf_markdown = !used_ocr && pre_rendered_markdown.is_some();
+        tracing::debug!(
+            used_ocr,
+            has_pre_rendered = pre_rendered_markdown.is_some(),
+            use_pdf_markdown,
+            pre_rendered_len = pre_rendered_markdown.as_ref().map(|m| m.len()).unwrap_or(0),
+            "PDF extractor: deciding whether to use pre-rendered markdown"
+        );
 
         #[cfg(not(feature = "pdf"))]
         let use_pdf_markdown = false;
@@ -393,8 +400,14 @@ impl DocumentExtractor for PdfExtractor {
         // content but still need apply_output_format() for format-specific conversion.
         #[cfg(feature = "pdf")]
         let pre_formatted_output = if used_pdf_markdown && config.output_format == OutputFormat::Markdown {
+            tracing::trace!("PDF extractor: signaling pre-formatted markdown to pipeline");
             Some("markdown".to_string())
         } else {
+            tracing::trace!(
+                used_pdf_markdown,
+                output_format = ?config.output_format,
+                "PDF extractor: NOT signaling pre-formatted markdown"
+            );
             None
         };
         #[cfg(not(feature = "pdf"))]
diff --git a/crates/kreuzberg/src/pdf/markdown/pipeline.rs b/crates/kreuzberg/src/pdf/markdown/pipeline.rs
@@ -30,6 +30,7 @@ pub fn render_document_as_markdown_with_tables(
 ) -> Result<String> {
     let pages = document.pages();
     let page_count = pages.len();
+    tracing::debug!(page_count, "PDF markdown pipeline: starting render");
 
     // Stage 0: Try structure tree extraction for each page.
     let mut struct_tree_results: Vec<Option<Vec<PdfParagraph>>> = Vec::with_capacity(page_count as usize);
@@ -42,6 +43,25 @@ pub fn render_document_as_markdown_with_tables(
 
         match extract_page_content(&page) {
             Ok(extraction) if extraction.method == ExtractionMethod::StructureTree && !extraction.blocks.is_empty() => {
+                tracing::trace!(
+                    page = i,
+                    method = ?extraction.method,
+                    block_count = extraction.blocks.len(),
+                    "PDF markdown pipeline: page extracted via structure tree"
+                );
+                // Log the first few blocks; the preview is cut on a char boundary so UTF-8 text cannot panic
+                for (bi, block) in extraction.blocks.iter().take(10).enumerate() {
+                    tracing::trace!(
+                        page = i,
+                        block_index = bi,
+                        role = ?block.role,
+                        text_preview = block.text.char_indices().nth(60).map_or(&block.text[..], |(end, _)| &block.text[..end]),
+                        font_size = ?block.font_size,
+                        is_bold = block.is_bold,
+                        child_count = block.children.len(),
+                        "PDF markdown pipeline: structure tree block"
+                    );
+                }
                 let page_width = page.width().value;
                 let filtered_blocks = filter_sidebar_blocks(&extraction.blocks, page_width);
                 let mut paragraphs = extracted_blocks_to_paragraphs(&filtered_blocks);
@@ -84,9 +104,31 @@ pub fn render_document_as_markdown_with_tables(
                 // Dehyphenate: structure tree path has no positional data,
                 // so only rejoin explicit trailing hyphens.
                 dehyphenate_paragraphs(&mut paragraphs, false);
+                let heading_count = paragraphs.iter().filter(|p| p.heading_level.is_some()).count();
+                let bold_count = paragraphs.iter().filter(|p| p.is_bold).count();
+                let has_font_variation = has_font_size_variation(&paragraphs);
+                tracing::trace!(
+                    page = i,
+                    paragraph_count = paragraphs.len(),
+                    heading_count,
+                    bold_count,
+                    has_font_variation,
+                    "PDF markdown pipeline: structure tree paragraphs after conversion"
+                );
                 if paragraphs.is_empty() {
                     struct_tree_results.push(None);
                     heuristic_pages.push(i as usize);
+                } else if heading_count == 0 && has_font_variation {
+                    // Structure tree has text with font size variation but no
+                    // heading tags. Add to heuristic extraction for font-size
+                    // clustering data; heading classification will be applied
+                    // to these paragraphs in Stage 3.
+                    tracing::debug!(
+                        page = i,
+                        "PDF markdown pipeline: structure tree has font variation but no headings, will classify via font-size clustering"
+                    );
+                    struct_tree_results.push(Some(paragraphs));
+                    heuristic_pages.push(i as usize);
                 } else {
                     struct_tree_results.push(Some(paragraphs));
                 }
@@ -175,7 +217,26 @@ pub fn render_document_as_markdown_with_tables(
         all_image_positions.extend(image_positions);
     }
 
-    // Stage 2: Global font-size clustering (only for heuristic pages).
+    // Identify structure tree pages that have font size variation but no
+    // heading signals — these need font-size-based heading classification.
+    // Pages with no font variation are left as plain paragraphs (classify
+    // would incorrectly assign headings based on unrelated pages' font data).
+    let struct_tree_needs_classify: std::collections::HashSet<usize> = struct_tree_results
+        .iter()
+        .enumerate()
+        .filter_map(|(i, result)| {
+            result.as_ref().and_then(|paragraphs| {
+                let has_headings = paragraphs.iter().any(|p| p.heading_level.is_some());
+                if !has_headings && has_font_size_variation(paragraphs) {
+                    Some(i)
+                } else {
+                    None
+                }
+            })
+        })
+        .collect();
+
+    // Stage 2: Global font-size clustering (heuristic pages + struct tree pages needing classification).
     let mut all_blocks: Vec<TextBlock> = Vec::new();
     let empty_bbox = BoundingBox {
         left: 0.0,
@@ -195,6 +256,18 @@ pub fn render_document_as_markdown_with_tables(
             });
         }
     }
+    // Include font sizes from struct tree pages that need classification.
+    for &i in &struct_tree_needs_classify {
+        if let Some(paragraphs) = &struct_tree_results[i] {
+            for para in paragraphs {
+                all_blocks.push(TextBlock {
+                    text: String::new(),
+                    bbox: empty_bbox,
+                    font_size: para.dominant_font_size,
+                });
+            }
+        }
+    }
 
     let heading_map = if all_blocks.is_empty() {
         Vec::new()
@@ -206,7 +279,17 @@ pub fn render_document_as_markdown_with_tables(
     // Stage 3: Per-page structured extraction.
     let mut all_page_paragraphs: Vec<Vec<PdfParagraph>> = Vec::with_capacity(page_count as usize);
     for i in 0..page_count as usize {
-        if let Some(paragraphs) = struct_tree_results[i].take() {
+        if let Some(mut paragraphs) = struct_tree_results[i].take() {
+            // Apply heading classification to struct tree pages that have
+            // font size variation but no structure-tree-level headings.
+            if struct_tree_needs_classify.contains(&i) {
+                tracing::debug!(
+                    page = i,
+                    "PDF markdown pipeline: classifying struct tree page via font-size clustering"
+                );
+                classify_paragraphs(&mut paragraphs, &heading_map);
+                merge_continuation_paragraphs(&mut paragraphs);
+            }
             all_page_paragraphs.push(paragraphs);
         } else {
             let lines = segments_to_lines(std::mem::take(&mut all_page_segments[i]));
@@ -245,8 +328,21 @@ pub fn render_document_as_markdown_with_tables(
     // demote numbered section headings when a title H1 is detected.
     refine_heading_hierarchy(&mut all_page_paragraphs);
 
+    let total_paragraphs: usize = all_page_paragraphs.iter().map(|p| p.len()).sum();
+    tracing::debug!(
+        heuristic_page_count = heuristic_pages.len(),
+        total_paragraphs,
+        heading_map_len = heading_map.len(),
+        "PDF markdown pipeline: stage 3 complete, assembling markdown"
+    );
+
     // Stage 4: Assemble markdown with tables interleaved
     let markdown = assemble_markdown_with_tables(all_page_paragraphs, tables, page_marker_format);
+    tracing::debug!(
+        markdown_len = markdown.len(),
+        has_headings = markdown.contains("# "),
+        "PDF markdown pipeline: assembly complete"
+    );
 
     // Stage 5: Inject image placeholders from positions collected during object extraction
     if all_image_positions.is_empty() {
@@ -495,6 +591,26 @@ fn apply_dehyphenation_join(
     }
 }
 
+/// Check if paragraphs have meaningful font size variation.
+///
+/// Returns true when the smallest and largest positive font sizes differ by
+/// more than 0.5pt, regardless of paragraph order; NaN sizes are skipped.
+fn has_font_size_variation(paragraphs: &[PdfParagraph]) -> bool {
+    let mut range: Option<(f32, f32)> = None;
+    for size in paragraphs.iter().map(|p| p.dominant_font_size) {
+        if size.is_nan() || size <= 0.0 {
+            continue;
+        }
+        // Track the running min/max so the result is order-independent.
+        let (lo, hi) = range.map_or((size, size), |(lo, hi)| (lo.min(size), hi.max(size)));
+        if hi - lo > 0.5 {
+            return true;
+        }
+        range = Some((lo, hi));
+    }
+    false
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -695,4 +811,47 @@ mod tests {
         assert_eq!(p.lines[0].segments[1].text, "software");
         assert_eq!(p.lines[1].segments[0].text, "next words");
     }
+
+    // ── has_font_size_variation tests ──
+
+    fn para_with_font_size(font_size: f32) -> PdfParagraph {
+        PdfParagraph {
+            lines: vec![line(vec![seg("text", 0.0, 100.0)])],
+            dominant_font_size: font_size,
+            heading_level: None,
+            is_bold: false,
+            is_list_item: false,
+            is_code_block: false,
+        }
+    }
+
+    #[test]
+    fn test_has_font_size_variation_empty() {
+        assert!(!has_font_size_variation(&[]));
+    }
+
+    #[test]
+    fn test_has_font_size_variation_single_size() {
+        let paragraphs = vec![para_with_font_size(12.0), para_with_font_size(12.0)];
+        assert!(!has_font_size_variation(&paragraphs));
+    }
+
+    #[test]
+    fn test_has_font_size_variation_different_sizes() {
+        let paragraphs = vec![para_with_font_size(12.0), para_with_font_size(18.0)];
+        assert!(has_font_size_variation(&paragraphs));
+    }
+
+    #[test]
+    fn test_has_font_size_variation_small_difference_ignored() {
+        // 0.3pt difference is within 0.5pt tolerance
+        let paragraphs = vec![para_with_font_size(12.0), para_with_font_size(12.3)];
+        assert!(!has_font_size_variation(&paragraphs));
+    }
+
+    #[test]
+    fn test_has_font_size_variation_zero_sizes_ignored() {
+        let paragraphs = vec![para_with_font_size(0.0), para_with_font_size(0.0)];
+        assert!(!has_font_size_variation(&paragraphs));
+    }
 }
diff --git a/crates/kreuzberg/tests/pdf_markdown_extraction.rs b/crates/kreuzberg/tests/pdf_markdown_extraction.rs