typo check with llm

christianabbet · christianabbet · commit bd33b3745dd3 · 2026-03-05T14:51:15.000+01:00
diff --git a/main.py b/main.py
@@ -331,10 +331,10 @@ def main(
     if mlflow_tracking:
         mlflow.end_run()
 
+    # Reclassify section header pages using the label of their following page
     documents_pages = [reclassify_section_headers(doc) for doc in documents_pages]
 
     if not return_entities:
-        # Reclassify section header pages using the label of their following page
         return documents_pages
     else:
         entities = forward_document_entities(documents=documents_pages)
diff --git a/src/classifiers/pixtral_classifier.py b/src/classifiers/pixtral_classifier.py
@@ -54,7 +54,7 @@ class PixtralMessage(BaseModel):
 
     @model_validator(mode="after")
     def at_least_one_field(self):
-        """Ensure at least on field (text, image) is present."""
+        """Ensure at least one field (text, image) is present."""
         if self.text is None and self.image is None:
             raise ValueError("PixtralMessage must have either 'text' or 'image'")
         return self
@@ -359,7 +359,7 @@ def _build_conversation(self, text: str, image_bytes: bytes) -> PixtralMessageSt
         """Build a minimal user message containing only a text and the target page image.
 
         Args:
-            text: Text provided along with image.
+            text (str): Text provided along with the image.
             image_bytes (bytes): Encoded bytes of the page to process.
 
         Returns:
@@ -382,7 +382,7 @@ def find(self, text: str, page: pymupdf.Page) -> str:
         """Extract a feature from a single PDF page using the Pixtral model.
 
         Args:
-            text (str): Text provided along of image
+            text (str): Text provided along with the image.
             page (pymupdf.Page): The PyMuPDF page object to process.
 
         Returns:
diff --git a/src/entity/titlepage_parser.py b/src/entity/titlepage_parser.py
@@ -1,5 +1,6 @@
 """Convert title / section document to processed entries."""
 
+import re
 from dataclasses import dataclass
 
 import pymupdf
@@ -8,7 +9,6 @@
 
 from src.models.feature_engineering import extract_and_cache_page_data
 from src.utils.text_clustering import create_text_blocks
-from src.utils.utility import standardize_text
 
 
 @dataclass
@@ -38,66 +38,53 @@ def __init__(self, text_block: TextBlock, rect: Rect):
         )
 
     @property
-    def contains_keywords(self) -> int:
-        """Score item if it contains a keyword.
-
-        Returns:
-            int: 1 if keywords found, 0 otherwise.
-        """
-        std_text = standardize_text(self.text)
-        return int(any([keyword in std_text for keyword in ["bericht", "etude"]]))
+    def length(self) -> float:
+        """Return 1.0 if the text contains more than 5 characters, else 0.0."""
+        return float(len(self.text) > 5)
 
     @property
-    def horizontal_centrality(self) -> float:
-        """Horizontal centrality of the block.
+    def horizontality(self) -> float:
+        """Return 1.0 if the block starts in the left 40% of the page width, else 0.0."""
+        return float(self.rect.x0 < 0.4)
 
-        Returns:
-            float: Score in [0, 1] where 1 means the block is perfectly horizontally centered.
-        """
-        return 1 - 2 * abs(0.5 - (self.rect.x1 + self.rect.x0) / 2)
+    @property
+    def verticality(self) -> float:
+        """Return 1.0 if the block ends in the upper 75% of the page height, else 0.0."""
+        return float(self.rect.y1 < 0.75)
 
     @property
-    def horizontal_leftness(self) -> float:
-        """Horizontal leftness score of the block.
+    def non_numericality(self) -> float:
+        """Return the fraction of non-digit characters in the text.
 
         Returns:
-            float: Score in [0, 1] where higher values indicate left position.
+            float: Value in [0, 1]; 1.0 means no digits, 0.0 means all digits.
         """
-        return min(1, 2 - (self.rect.x1 + self.rect.x0))
+        n_digits = len(re.findall(r"\d", self.text))
+        n_total = len(self.text)
+        return 1 - (n_digits / max(n_total, 1))
 
     @property
     def font(self) -> float:
-        """Normalized font size proxy.
-
-        Returns:
-            float: Normalized line height in [0, 1] coordinate space.
-        """
+        """Return an approximate normalised font size (block height per line)."""
         return self.rect.height / max(self.line_count, 1)
 
     @property
     def highness(self) -> float:
-        """Vertical position score.
-
-        Higher values for blocks closer to the top of the page.
-
-        Returns:
-            float: Score in [0, 1] where 1 means the block starts at the very top of the page.
-        """
+        """Return a score favouring blocks near the top of the page."""
         return 1 - self.rect.y0
 
     @property
     def score(self) -> float:
-        """Combined title-likelihood score.
+        """Return a composite title-likelihood score.
 
-        The metric is based on horizontal centrality, font size, and vertical position.
+        Multiplies all heuristic signals: font size, horizontal position,
+        vertical position, text length, non-numericality, and highness.
+        A higher score indicates a stronger title candidate.
 
         Returns:
-            float: Estimated title-likelihood score. Higher means more likely a title.
+            float: Non-negative composite score; 0 if any signal is False/zero.
         """
-        # TODO improve metric
-        # return (self.horizontal_centrality * self.font * self.highness) + self.contains_keywords
-        # return self.horizontal_centrality * self.font * self.highness
-        return self.font
+        return self.font * self.horizontality * self.verticality * self.length * self.non_numericality * self.highness
 
 
 def extract_title_from_page(page: pymupdf.Page) -> str:
diff --git a/src/entity/utils.py b/src/entity/utils.py
@@ -18,14 +18,10 @@ def pages_to_bytes(pdf_document: Document, page_start: int, page_end: int) -> By
         BytesIO: Selected subset of pages as bytes.
     """
     # Create a new PDF for the selected pages
-    select_pdf = pymupdf.open()
+    with pymupdf.open() as select_pdf:
+        for page_number in range(page_start, page_end + 1):
+            # Insert the page into the new PDF
+            select_pdf.insert_pdf(pdf_document, from_page=page_number - 1, to_page=page_number - 1)
 
-    for page_number in range(page_start, page_end + 1):
-        # Insert the page into the new PDF
-        select_pdf.insert_pdf(pdf_document, from_page=page_number - 1, to_page=page_number - 1)
-
-    # Extract bytes and close document
-    select_pdf_bytes = BytesIO(select_pdf.tobytes())
-    select_pdf.close()
-
-    return select_pdf_bytes
+        # Extract bytes; the temporary document is closed when the with-block exits
+        return BytesIO(select_pdf.tobytes())
diff --git a/src/scripts/pixtral_extract_feature.py b/src/scripts/pixtral_extract_feature.py
@@ -1,5 +1,7 @@
 import json
 import logging
+import shutil
+import tempfile
 from pathlib import Path
 
 import click
@@ -23,7 +25,7 @@
 def update_ground_truth(
     ground_truth: DocumentGroundTruth, document: Path, pixtral_interface: PixtralFeatureExtraction
 ) -> DocumentGroundTruth:
-    """Runs Pixtral feature extraction on each page of a document and updates the ground truth in-place.
+    """Run Pixtral feature extraction on each page and update the ground truth pages in-place.
 
     Args:
         ground_truth (DocumentGroundTruth): Ground truth object whose pages will be updated.
@@ -39,14 +41,14 @@ def update_ground_truth(
         for ground_truth_page in ground_truth.pages:
             # Load page
             page = doc.load_page(ground_truth_page.page - 1)
-            # Extarct OCR text
+            # Extract OCR text
             extraction_context = extract_and_cache_page_data(page)
             lines = extraction_context.text_lines
             text_blocks = create_text_blocks(lines)
             text = "\n".join([line.text for block in text_blocks for line in block.lines])
 
             # Extract feature (title)
-            if text and page:
+            if text:
                 ground_truth_page.title = pixtral_interface.find(text=text, page=page)
             else:
                 ground_truth_page.title = None
diff --git a/src/utils/utility.py b/src/utils/utility.py
@@ -91,7 +91,7 @@ def get_pdf_files(input_path: Path) -> list[Path]:
 
 
 def standardize_text(text: str) -> str:
-    """Standardize text by removing new lines, double spaces and uppercaps.
+    """Standardize text by removing new lines and double spaces, and by lowercasing it.
 
     Args:
         text (str): Text to standardize.