Skip to content

Commit 41167da

Browse files
documentation improvement
1 parent e7333a0 commit 41167da

File tree

4 files changed

+86
-24
lines changed

4 files changed

+86
-24
lines changed

main.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -97,13 +97,13 @@ def flatten_dict(d: dict, parent_key: str = "", sep: str = ".") -> dict:
9797

9898

9999
def group_consecutive(pages: list[ProcessorPage]) -> list[list[ProcessorPage]]:
100-
"""Group sorted integers into consecutive sequences.
100+
"""Group sorted pages into consecutive sequences.
101101
102102
Args:
103-
pages (list[ProcessorPage]): Pages to group.
103+
pages (list[ProcessorPage]): Pages to group by consecutive page numbers.
104104
105105
Returns:
106-
list[list[ProcessorPage]]: List of sorted pages group.
106+
list[list[ProcessorPage]]: List of consecutive page groups.
107107
"""
108108
sorted_pages = sorted(pages, key=lambda p: p.page)
109109

@@ -201,7 +201,7 @@ def forward_document_entities_group(
201201
classification (PageClasses): The classification type of the page group.
202202
page_start (int): First page index in the consecutive group (1-based).
203203
page_end (int): Last page index in the consecutive group (1-based).
204-
title (str): Title for the given set of documents.
204+
title (str | None): Title for the given set of documents.
205205
language (str | None): Detected language of the page group.
206206
pdf_file (Path): Path to the source PDF file.
207207

src/classifiers/pixtral_classifier.py

Lines changed: 52 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -20,20 +20,34 @@
2020

2121

2222
class PixtralImageSource(BaseModel):
23-
"""Raw bytes payload for an image."""
23+
"""Raw bytes payload for an image.
24+
25+
Attributes:
26+
bytes_ (bytes): Raw image bytes.
27+
"""
2428

2529
bytes_: bytes = Field(alias="bytes")
2630

2731

2832
class PixtralImage(BaseModel):
29-
"""Image content block containing its format and raw bytes source."""
33+
"""Image content block containing its format and raw bytes source.
34+
35+
Attributes:
36+
format_ (str): Image format (e.g., 'jpeg').
37+
source (PixtralImageSource): Container for image bytes.
38+
"""
3039

3140
format_: str = Field(alias="format")
3241
source: PixtralImageSource
3342

3443

3544
class PixtralMessage(BaseModel):
36-
"""A single content block in a Pixtral conversation, either text or image."""
45+
"""A single content block in a Pixtral conversation, either text or image.
46+
47+
Attributes:
48+
text (str | None): Text content, or None if image is provided.
49+
image (PixtralImage | None): Image content, or None if text is provided.
50+
"""
3751

3852
text: str | None = None
3953
image: PixtralImage | None = None
@@ -47,20 +61,33 @@ def at_least_one_field(self):
4761

4862

4963
class PixtralMessageStack(BaseModel):
50-
"""A full conversation turn with a role (e.g. 'user') and a list of content blocks."""
64+
"""A full conversation turn with a role and a list of content blocks.
65+
66+
Attributes:
67+
role (str): Role identifier (e.g., 'user').
68+
content (list[PixtralMessage]): List of content blocks in this turn.
69+
"""
5170

5271
role: str
5372
content: list[PixtralMessage]
5473

5574

5675
class PixtralResponseOutput(BaseModel):
57-
"""The output field of response, wrapping the assistant message."""
76+
"""The output field of a response, wrapping the assistant message.
77+
78+
Attributes:
79+
message (PixtralMessageStack): The assistant's response message.
80+
"""
5881

5982
message: PixtralMessageStack
6083

6184

6285
class PixtralResponse(BaseModel):
63-
"""Top-level response, containing the model output."""
86+
"""Top-level response containing the model output.
87+
88+
Attributes:
89+
output (PixtralResponseOutput): Response output wrapper.
90+
"""
6491

6592
output: PixtralResponseOutput
6693

@@ -92,8 +119,15 @@ def acquire(self):
92119
time.sleep(0.01)
93120

94121

95-
def is_throttle_error(e) -> bool:
96-
"""Determine whether a boto3 ClientError is a throttling or overload error."""
122+
def is_throttle_error(e: ClientError) -> bool:
123+
"""Determine whether a boto3 ClientError is a throttling or overload error.
124+
125+
Args:
126+
e (ClientError): A boto3 ClientError exception.
127+
128+
Returns:
129+
bool: True if the error is a throttling/overload error, False otherwise.
130+
"""
97131
try:
98132
code = e.response["Error"]["Code"]
99133
if code in {
@@ -145,6 +179,9 @@ def _send_conversation(self, message: PixtralMessageStack, system: PixtralMessag
145179
146180
Returns:
147181
PixtralResponse: The validated model response.
182+
183+
Raises:
184+
ClientError: If the API call fails after the maximum number of retries.
148185
"""
149186
attempt = 0
150187
while True:
@@ -221,15 +258,15 @@ def __init__(
221258
def determine_class(
222259
self, page: pymupdf.Page, page_number: int, context_builder: Callable[[], PageContext] = None, **kwargs
223260
) -> PageClasses:
224-
"""Determines the class of a document page using the Pixtral model.
261+
"""Determine the page class using the Pixtral vision model.
225262
226-
Falls back to treebased classifier if output is malformed or ClientError.
263+
Falls back to the fallback classifier if the output is malformed or an API error occurs.
227264
228265
Args:
229-
page (pymupdf.Page): The page of the document that should be classified
230-
page_number (int): the Page number of the page that should be classified
266+
page (pymupdf.Page): The PDF page to classify.
267+
page_number (int): The page number.
231268
context_builder (Callable): Builds page context (e.g., text blocks, lines) for fallback classifier.
232-
**kwargs: Additionally passed unused arguments
269+
**kwargs: Additional keyword arguments, if needed.
233270
234271
Returns:
235272
PageClasses: The predicted page class.
@@ -272,10 +309,10 @@ def _build_conversation(self, image_bytes: bytes) -> PixtralMessageStack:
272309
"""Build the user message containing few-shot examples and the target image.
273310
274311
Args:
275-
image_bytes: Encoded bytes of the page to classify.
312+
image_bytes (bytes): JPEG-encoded bytes of the page to classify.
276313
277314
Returns:
278-
PixtralMessageStack: A user turn ready to send.
315+
PixtralMessageStack: A user turn ready to send to the model.
279316
"""
280317
# List of examples for pixtral model
281318
content_examples = [

src/entity/titlepage_parser.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,11 @@ def __init__(self, text_block: TextBlock, rect: Rect):
3939

4040
@property
4141
def contains_keywords(self) -> int:
42-
"""Score item if it contains a keyword."""
42+
"""Score item if it contains a keyword.
43+
44+
Returns:
45+
int: 1 if keywords found, 0 otherwise.
46+
"""
4347
std_text = standardize_text(self.text)
4448
return int(any([keyword in std_text for keyword in ["bericht", "etude"]]))
4549

@@ -54,7 +58,12 @@ def horizontal_centrality(self) -> float:
5458

5559
@property
5660
def horizontal_leftness(self) -> float:
57-
return max(1, 2 - (self.rect.x1 + self.rect.x0))
61+
"""Horizontal leftness score of the block.
62+
63+
Returns:
64+
float: Score in [0, 1] where higher values indicate left position.
65+
"""
66+
return min(1, 2 - (self.rect.x1 + self.rect.x0))
5867

5968
@property
6069
def font(self) -> float:

src/schemas.py

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,21 +4,37 @@
44

55

66
class DocumentMetadata(BaseModel):
7-
"""Document-level metadata extracted from a PDF."""
7+
"""Document-level metadata extracted from a PDF.
8+
9+
Attributes:
10+
page_count (int): Total number of pages in the document.
11+
"""
812

913
page_count: int
1014

1115

1216
class DocumentPage(BaseModel):
13-
"""Classification annotation for a single page."""
17+
"""Classification annotation for a single page.
18+
19+
Attributes:
20+
page (int): Page number.
21+
title (str | None): Extracted title for the page.
22+
classification (dict[PageClasses, int]): Per-label binary classification (0 or 1).
23+
"""
1424

1525
page: int
1626
title: str | None = None
1727
classification: dict[PageClasses, int]
1828

1929

2030
class DocumentGroundTruth(BaseModel):
21-
"""Ground-truth annotation for a complete PDF document."""
31+
"""Ground-truth annotation for a complete PDF document.
32+
33+
Attributes:
34+
filename (str): Name of the PDF file.
35+
metadata (DocumentMetadata): Document-level metadata.
36+
pages (list[DocumentPage]): Per-page annotations.
37+
"""
2238

2339
filename: str
2440
metadata: DocumentMetadata

0 commit comments

Comments
 (0)