documentation improvement

christianabbet · christianabbet · commit 41167da4f880 · 2026-03-05T11:52:18.000+01:00
diff --git a/main.py b/main.py
@@ -97,13 +97,13 @@ def flatten_dict(d: dict, parent_key: str = "", sep: str = ".") -> dict:
 
 
 def group_consecutive(pages: list[ProcessorPage]) -> list[list[ProcessorPage]]:
-    """Group sorted integers into consecutive sequences.
+    """Group sorted pages into consecutive sequences.
 
     Args:
-        pages (list[ProcessorPage]): Pages to group.
+        pages (list[ProcessorPage]): Pages to group by consecutive page numbers.
 
     Returns:
-        list[list[ProcessorPage]]: List of sorted pages group.
+        list[list[ProcessorPage]]: List of consecutive page groups.
     """
     sorted_pages = sorted(pages, key=lambda p: p.page)
 
@@ -201,7 +201,7 @@ def forward_document_entities_group(
         classification (PageClasses): The classification type of the page group.
         page_start (int): First page index in the consecutive group (1-based).
         page_end (int): Last page index in the consecutive group (1-based).
-        title (str): Title for the given set of documents.
+        title (str | None): Title for the given set of documents.
         language (str | None): Detected language of the page group.
         pdf_file (Path): Path to the source PDF file.
 
diff --git a/src/classifiers/pixtral_classifier.py b/src/classifiers/pixtral_classifier.py
@@ -20,20 +20,34 @@
 
 
 class PixtralImageSource(BaseModel):
-    """Raw bytes payload for an image."""
+    """Raw bytes payload for an image.
+
+    Attributes:
+        bytes_ (bytes): Raw image bytes.
+    """
 
     bytes_: bytes = Field(alias="bytes")
 
 
 class PixtralImage(BaseModel):
-    """Image content block containing its format and raw bytes source."""
+    """Image content block containing its format and raw bytes source.
+
+    Attributes:
+        format_ (str): Image format (e.g., 'jpeg').
+        source (PixtralImageSource): Container for image bytes.
+    """
 
     format_: str = Field(alias="format")
     source: PixtralImageSource
 
 
 class PixtralMessage(BaseModel):
-    """A single content block in a Pixtral conversation, either text or image."""
+    """A single content block in a Pixtral conversation, either text or image.
+
+    Attributes:
+        text (str | None): Text content, or None if image is provided.
+        image (PixtralImage | None): Image content, or None if text is provided.
+    """
 
     text: str | None = None
     image: PixtralImage | None = None
@@ -47,20 +61,33 @@ def at_least_one_field(self):
 
 
 class PixtralMessageStack(BaseModel):
-    """A full conversation turn with a role (e.g. 'user') and a list of content blocks."""
+    """A full conversation turn with a role and a list of content blocks.
+
+    Attributes:
+        role (str): Role identifier (e.g., 'user').
+        content (list[PixtralMessage]): List of content blocks in this turn.
+    """
 
     role: str
     content: list[PixtralMessage]
 
 
 class PixtralResponseOutput(BaseModel):
-    """The output field of response, wrapping the assistant message."""
+    """The output field of a response, wrapping the assistant message.
+
+    Attributes:
+        message (PixtralMessageStack): The assistant's response message.
+    """
 
     message: PixtralMessageStack
 
 
 class PixtralResponse(BaseModel):
-    """Top-level response, containing the model output."""
+    """Top-level response containing the model output.
+
+    Attributes:
+        output (PixtralResponseOutput): Response output wrapper.
+    """
 
     output: PixtralResponseOutput
 
@@ -92,8 +119,15 @@ def acquire(self):
             time.sleep(0.01)
 
 
-def is_throttle_error(e) -> bool:
-    """Determine whether a boto3 ClientError is a throttling or overload error."""
+def is_throttle_error(e: ClientError) -> bool:
+    """Determine whether a boto3 ClientError is a throttling or overload error.
+
+    Args:
+        e (ClientError): A boto3 ClientError exception.
+
+    Returns:
+        bool: True if the error is a throttling/overload error, False otherwise.
+    """
     try:
         code = e.response["Error"]["Code"]
         if code in {
@@ -145,6 +179,9 @@ def _send_conversation(self, message: PixtralMessageStack, system: PixtralMessag
 
         Returns:
             PixtralResponse: The validated model response.
+
+        Raises:
+            ClientError: If API call fails after max retries.
         """
         attempt = 0
         while True:
@@ -221,15 +258,15 @@ def __init__(
     def determine_class(
         self, page: pymupdf.Page, page_number: int, context_builder: Callable[[], PageContext] = None, **kwargs
     ) -> PageClasses:
-        """Determines the class of a document page using the Pixtral model.
+        """Determine the page class using the Pixtral vision model.
 
-        Falls back to treebased classifier if output is malformed or ClientError.
+        Falls back to the fallback classifier if the output is malformed or an API error occurs.
 
         Args:
-            page (pymupdf.Page): The page of the document that should be classified
-            page_number (int): the Page number of the page that should be classified
+            page (pymupdf.Page): The PDF page to classify.
+            page_number (int): The page number.
             context_builder (Callable): Builds page context (e.g., text blocks, lines) for fallback classifier.
-            **kwargs: Additionally passed unused arguments
+            **kwargs: Additionally passed arguments if needed.
 
         Returns:
             PageClasses: The predicted page class.
@@ -272,10 +309,10 @@ def _build_conversation(self, image_bytes: bytes) -> PixtralMessageStack:
         """Build the user message containing few-shot examples and the target image.
 
         Args:
-            image_bytes: Encoded bytes of the page to classify.
+            image_bytes (bytes): JPEG-encoded bytes of the page to classify.
 
         Returns:
-            PixtralMessageStack: A user turn ready to send.
+            PixtralMessageStack: A user turn ready to send to the model.
         """
         # List of examples for pixtral model
         content_examples = [
diff --git a/src/entity/titlepage_parser.py b/src/entity/titlepage_parser.py
@@ -39,7 +39,11 @@ def __init__(self, text_block: TextBlock, rect: Rect):
 
     @property
     def contains_keywords(self) -> int:
-        """Score item if it contains a keyword."""
+        """Score item if it contains a keyword.
+
+        Returns:
+            int: 1 if keywords found, 0 otherwise.
+        """
         std_text = standardize_text(self.text)
         return int(any([keyword in std_text for keyword in ["bericht", "etude"]]))
 
@@ -54,7 +58,12 @@ def horizontal_centrality(self) -> float:
 
     @property
     def horizontal_leftness(self) -> float:
-        return max(1, 2 - (self.rect.x1 + self.rect.x0))
+        """Horizontal leftness score of the block.
+
+        Returns:
+            float: Score in [0, 1] where higher values indicate left position.
+        """
+        return min(1, 2 - (self.rect.x1 + self.rect.x0))
 
     @property
     def font(self) -> float:
diff --git a/src/schemas.py b/src/schemas.py
@@ -4,21 +4,37 @@
 
 
 class DocumentMetadata(BaseModel):
-    """Document-level metadata extracted from a PDF."""
+    """Document-level metadata extracted from a PDF.
+
+    Attributes:
+        page_count (int): Total number of pages in the document.
+    """
 
     page_count: int
 
 
 class DocumentPage(BaseModel):
-    """Classification annotation for a single page."""
+    """Classification annotation for a single page.
+
+    Attributes:
+        page (int): Page number.
+        title (str | None): Extracted title for the page.
+        classification (dict[PageClasses, int]): Per-label binary classification (0 or 1).
+    """
 
     page: int
     title: str | None = None
     classification: dict[PageClasses, int]
 
 
 class DocumentGroundTruth(BaseModel):
-    """Ground-truth annotation for a complete PDF document."""
+    """Ground-truth annotation for a complete PDF document.
+
+    Attributes:
+        filename (str): Name of the PDF file.
+        metadata (DocumentMetadata): Document-level metadata.
+        pages (list[DocumentPage]): Per-page annotations.
+    """
 
     filename: str
     metadata: DocumentMetadata