first estimation for title and section

christianabbet · christianabbet · commit 47f15b1c1d3e · 2026-02-19T15:18:32.000+01:00
diff --git a/.gitignore b/.gitignore
@@ -21,6 +21,7 @@ venv/
 minio
 
 # IDE config
+.claude/
 .idea/
 .vscode/*
 !.vscode/launch.json.template.jsonc
diff --git a/main.py b/main.py
@@ -8,9 +8,10 @@
 from dotenv import load_dotenv
 from swissgeol_doc_processing.utils.file_utils import read_params as swissgeol_read_params
 
-from src.boreprofile.entity_parser import document_to_boreprofiles
 from src.classifiers.classifier_factory import ClassifierTypes, create_classifier
 from src.constants import DEFAULT_TREEBASED_MODEL_PATH
+from src.entity.borehole_parser import document_to_boreprofiles
+from src.entity.titlepage_parser import document_to_titlepages
 from src.page_classes import PageClasses
 from src.page_structure import (
     ProcessedEntities,
@@ -34,8 +35,22 @@
 
 
 def setup_mlflow(
-    input_path: Path, ground_truth_path: Path, model_path: str, matching_params: dict, classifier_name: str
+    input_path: Path,
+    matching_params: dict,
+    ground_truth_path: Path | None,
+    model_path: str | None = None,
+    classifier_name: str | None = None,
 ):
+    """Configure MLflow tracking with experiment metadata and git information.
+
+    Args:
+        input_path (Path): Path to input PDF directory.
+        matching_params (dict): Dictionary of matching parameters.
+        ground_truth_path (Path | None): Path to ground truth JSON file, or None to skip.
+        model_path (str | None): Path to pretrained model file, or None to use the default.
+        classifier_name (str | None): Name of the classifier being used, or None if not applicable.
+
+    """
     mlflow.set_experiment("PDF Page Classification")
     mlflow.start_run()
 
@@ -60,7 +75,17 @@ def setup_mlflow(
         logger.warning(f"Could not attach Git metadata to MLflow: {e}")
 
 
-def flatten_dict(d, parent_key="", sep=".") -> dict:
+def flatten_dict(d: dict, parent_key: str = "", sep: str = ".") -> dict:
+    """Flatten a nested dictionary into a single-level dictionary.
+
+    Args:
+        d (dict): Dictionary to flatten.
+        parent_key (str): Parent key prefix for nested keys.
+        sep (str): Separator character for joining keys (default ".").
+
+    Returns:
+        dict: A flattened dictionary with separated keys.
+    """
     items = []
     for k, v in d.items():
         new_key = f"{parent_key}{sep}{k}" if parent_key else k
@@ -182,6 +207,10 @@ def forward_document_entities_group(
     """
     if classification == PageClasses.BOREPROFILE:
         return document_to_boreprofiles(pdf_file=pdf_file, page_start=page_start, page_end=page_end, lang=language)
+    elif classification == PageClasses.TITLE_PAGE or classification == PageClasses.SECTION_HEADER:
+        return document_to_titlepages(
+            pdf_file=pdf_file, classification=classification, page_start=page_start, page_end=page_end, lang=language
+        )
     else:
         return [
             ProcessedEntities(
diff --git a/src/entity/borehole_parser.py b/src/entity/borehole_parser.py
@@ -6,34 +6,14 @@
 
 import pymupdf
 from extraction.runner import extract
-from pymupdf import Document
 
+from src.entity.utils import _select_pages
 from src.page_classes import PageClasses
 from src.page_structure import ProcessedEntities
 
 logger = logging.getLogger(__name__)
 
 
-def _select_pages(pdf_document: Document, page_numbers: list[int]) -> Document:
-    """Select pages from PDF.
-
-    Args:
-        pdf_document (Document): PDF to split.
-        page_numbers (list[int]): List of pages to extract (1-based).
-
-    Returns:
-        Document: Selected subset.
-    """
-    # Create a new PDF for the selected pages
-    select_pdf = pymupdf.open()
-
-    for page_number in page_numbers:
-        # Insert the page into the new PDF
-        select_pdf.insert_pdf(pdf_document, from_page=page_number - 1, to_page=page_number - 1)
-
-    return select_pdf
-
-
 def _find_undetected_pages(
     entities: list[ProcessedEntities],
     page_numbers: list[int],
@@ -106,7 +86,7 @@ def document_to_boreprofiles(
         pdf_file (Path): Path to pdf file.
         page_start (int): Starting page (1-based).
         page_end (int): Ending page (1-based).
-        lang (str): Detected language.
+        lang (str | None): Detected language.
 
     Returns:
         list[ProcessedEntities]: List of boreprofile as entities.
@@ -116,7 +96,7 @@ def document_to_boreprofiles(
 
     # Open the PDF file, select pages and save
     with pymupdf.Document(pdf_file) as doc:
-        pdf_document_select = _select_pages(doc, page_numbers)
+        pdf_document_select = _select_pages(doc, page_start, page_end)
         bytes_document_select = BytesIO(pdf_document_select.tobytes())
 
     # Write file to temp location for inference
diff --git a/src/entity/titlepage_parser.py b/src/entity/titlepage_parser.py
@@ -0,0 +1,140 @@
+"""Convert title / section document to processed entries."""
+
+from dataclasses import dataclass
+from pathlib import Path
+
+import pymupdf
+from pymupdf import Rect
+from swissgeol_doc_processing.text.textblock import TextBlock
+
+from src.entity.utils import _select_pages
+from src.models.feature_engineering import extract_and_cache_page_data
+from src.page_classes import PageClasses
+from src.page_structure import ProcessedEntities
+from src.utils.text_clustering import create_text_blocks
+
+
+@dataclass
+class TitleCandidateTextBlock:
+    """Text block rescaled to page-relative coordinates and scored as a title candidate."""
+
+    text: str
+    line_count: int  # same name as the attribute set in __init__; the generated __repr__/__eq__ read it
+    rect: pymupdf.Rect
+
+    def __init__(self, text_block: TextBlock, rect: Rect):
+        """Create a scale invariant text block.
+
+        Coordinates are divided by the page width / height, so a block lying inside a page
+        whose origin is (0, 0) ends up inside a fictive [0, 0, 1, 1] rect.
+
+        Args:
+            text_block (TextBlock): Input text block.
+            rect (Rect): Size of the page linked to text block.
+        """
+        self.text = text_block.text
+        self.line_count = text_block.line_count
+        self.rect = pymupdf.Rect(
+            text_block.rect.x0 / rect.width,
+            text_block.rect.y0 / rect.height,
+            text_block.rect.x1 / rect.width,
+            text_block.rect.y1 / rect.height,
+        )
+
+    @property
+    def horizontal_centrality(self) -> float:
+        """Horizontal centrality of the block, from the midpoint of its normalized x-range.
+
+        Returns:
+            float: Score in [0, 1] where 1 means the block is perfectly horizontally centered.
+        """
+        return 1 - 2 * abs(0.5 - (self.rect.x1 + self.rect.x0) / 2)
+
+    @property
+    def font(self) -> float:
+        """Normalized font size proxy: normalized block height divided by its line count.
+
+        Returns:
+            float: Normalized line height in [0, 1] coordinate space.
+        """
+        return self.rect.height / self.line_count
+
+    @property
+    def highness(self) -> float:
+        """Vertical position score.
+
+        Higher values for blocks closer to the top of the page.
+
+        Returns:
+            float: Score in [0, 1] where 1 means the block starts at the very top of the page.
+        """
+        return 1 - self.rect.y0
+
+    @property
+    def score(self) -> float:
+        """Combined title-likelihood score.
+
+        The metric is the product of horizontal centrality, font size proxy, and vertical position.
+
+        Returns:
+            float: Estimated title-likelihood score. Higher means more likely a title.
+        """
+        return self.horizontal_centrality * self.font * self.highness
+
+
+def _extract_title_from_page(page) -> str:
+    """Extract the most likely title string from a single PDF page.
+
+    Builds text blocks from the page's text lines, wraps them as
+    scale-invariant blocks, scores them by title-likelihood, and returns
+    the text of the highest-scoring candidate (the first one on ties).
+
+    Args:
+        page (pymupdf.Page): The PDF page to analyse.
+
+    Returns:
+        str: Detected title for the page, or "" when the page yields no text block.
+    """
+    # Extract text block from page
+    extraction_context = extract_and_cache_page_data(page)
+    lines = extraction_context.text_lines
+    text_blocks = create_text_blocks(lines)
+
+    # Score every block; a page without text has no candidate, so fall back to an empty title
+    title_candidates = [TitleCandidateTextBlock(text_block=text_block, rect=page.rect) for text_block in text_blocks]
+    best_candidate = max(title_candidates, key=lambda candidate: candidate.score, default=None)
+    return best_candidate.text if best_candidate is not None else ""
+
+
+def document_to_titlepages(
+    pdf_file: Path, classification: PageClasses, page_start: int, page_end: int, lang: str | None
+) -> list[ProcessedEntities]:
+    """Extract title or section-header entities from a consecutive page range in a PDF.
+
+    Each page is processed individually and yields one ProcessedEntities entry whose `title` field
+    contains the detected title and whose `page_start` / `page_end` both point at that single page.
+    The source PDF and the temporary page selection are closed before returning.
+
+    Args:
+        pdf_file (Path): Path to the source PDF file.
+        classification (PageClasses): Page class label to assign.
+        page_start (int): First page index of the group (1-based).
+        page_end (int): Last page index of the group (1-based).
+        lang (str | None): Language code for the page group, or None if unknown.
+
+    Returns:
+        list[ProcessedEntities]: One ProcessedEntities per page, each with its `title`
+            field set to the highest-scoring title candidate extracted from that page.
+    """
+    # Copy the requested pages into a temporary document; both documents are closed on exit
+    with pymupdf.Document(pdf_file) as doc, _select_pages(doc, page_start, page_end) as pdf_document_select:
+        # The selection holds the group's pages in order, so counting from page_start gives each page number
+        return [
+            ProcessedEntities(
+                classification=classification,
+                page_start=page_number,
+                page_end=page_number,
+                language=lang,
+                title=_extract_title_from_page(page=page),
+            )
+            for page_number, page in enumerate(pdf_document_select.pages(), start=page_start)
+        ]
diff --git a/src/entity/utils.py b/src/entity/utils.py
@@ -0,0 +1,26 @@
+"""Base utils for entity extraction."""
+
+import pymupdf
+from pymupdf import Document
+
+
+def _select_pages(pdf_document: Document, page_start: int, page_end: int) -> Document:
+    """Copy an inclusive page range of a PDF into a new document (the caller should close it).
+
+    Args:
+        pdf_document (Document): PDF to split.
+        page_start (int): Start page (1-based, inclusive).
+        page_end (int): End page (1-based, inclusive).
+
+    Returns:
+        Document: New document holding the selected pages; empty if page_end < page_start.
+    """
+    # Create a new PDF for the selected pages
+    select_pdf = pymupdf.open()
+
+    # Guard the empty range: insert_pdf is documented to copy an inverted range in reverse order
+    if page_start <= page_end:
+        # One ranged call instead of one call per page, so resources shared by pages are copied once
+        select_pdf.insert_pdf(pdf_document, from_page=page_start - 1, to_page=page_end - 1)
+
+    return select_pdf