Skip to content

Commit bd33b37

Browse files
typo check with llm
1 parent 41167da commit bd33b37

File tree

6 files changed

+41
-56
lines changed

6 files changed

+41
-56
lines changed

main.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -331,10 +331,10 @@ def main(
331331
if mlflow_tracking:
332332
mlflow.end_run()
333333

334+
# Reclassify section header pages using the label of their following page
334335
documents_pages = [reclassify_section_headers(doc) for doc in documents_pages]
335336

336337
if not return_entities:
337-
# Reclassify section header pages using the label of their following page
338338
return documents_pages
339339
else:
340340
entities = forward_document_entities(documents=documents_pages)

src/classifiers/pixtral_classifier.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ class PixtralMessage(BaseModel):
5454

5555
@model_validator(mode="after")
5656
def at_least_one_field(self):
57-
"""Ensure at least on field (text, image) is present."""
57+
"""Ensure at least one field (text, image) is present."""
5858
if self.text is None and self.image is None:
5959
raise ValueError("PixtralMessage must have either 'text' or 'image'")
6060
return self
@@ -359,7 +359,7 @@ def _build_conversation(self, text: str, image_bytes: bytes) -> PixtralMessageSt
359359
"""Build a minimal user message containing only a text and the target page image.
360360
361361
Args:
362-
text: Text provided along with image.
362+
text (str): Text provided along with the image.
363363
image_bytes (bytes): Encoded bytes of the page to process.
364364
365365
Returns:
@@ -382,7 +382,7 @@ def find(self, text: str, page: pymupdf.Page) -> str:
382382
"""Extract a feature from a single PDF page using the Pixtral model.
383383
384384
Args:
385-
text (str): Text provided along of image
385+
text (str): Text provided along with the image.
386386
page (pymupdf.Page): The PyMuPDF page object to process.
387387
388388
Returns:

src/entity/titlepage_parser.py

Lines changed: 25 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""Convert title / section document to processed entries."""
22

3+
import re
34
from dataclasses import dataclass
45

56
import pymupdf
@@ -8,7 +9,6 @@
89

910
from src.models.feature_engineering import extract_and_cache_page_data
1011
from src.utils.text_clustering import create_text_blocks
11-
from src.utils.utility import standardize_text
1212

1313

1414
@dataclass
@@ -38,66 +38,53 @@ def __init__(self, text_block: TextBlock, rect: Rect):
3838
)
3939

4040
@property
41-
def contains_keywords(self) -> int:
42-
"""Score item if it contains a keyword.
43-
44-
Returns:
45-
int: 1 if keywords found, 0 otherwise.
46-
"""
47-
std_text = standardize_text(self.text)
48-
return int(any([keyword in std_text for keyword in ["bericht", "etude"]]))
41+
def length(self) -> float:
42+
"""Return 1.0 if the text contains more than 5 characters, else 0.0."""
43+
return float(len(self.text) > 5)
4944

5045
@property
51-
def horizontal_centrality(self) -> float:
52-
"""Horizontal centrality of the block.
46+
def horizontality(self) -> float:
47+
"""Return 1.0 if the block starts in the left 40% of the page width, else 0.0."""
48+
return float(self.rect.x0 < 0.4)
5349

54-
Returns:
55-
float: Score in [0, 1] where 1 means the block is perfectly horizontally centered.
56-
"""
57-
return 1 - 2 * abs(0.5 - (self.rect.x1 + self.rect.x0) / 2)
50+
@property
51+
def verticality(self) -> float:
52+
"""Return 1.0 if the block ends in the upper 75% of the page height, else 0.0."""
53+
return float(self.rect.y1 < 0.75)
5854

5955
@property
60-
def horizontal_leftness(self) -> float:
61-
"""Horizontal leftness score of the block.
56+
def non_numericality(self) -> float:
57+
"""Return the fraction of non-digit characters in the text.
6258
6359
Returns:
64-
float: Score in [0, 1] where higher values indicate left position.
60+
float: Value in [0, 1]; 1.0 means no digits, 0.0 means all digits.
6561
"""
66-
return min(1, 2 - (self.rect.x1 + self.rect.x0))
62+
n_digits = len(re.findall(r"\d", self.text))
63+
n_total = len(self.text)
64+
return 1 - (n_digits / max(n_total, 1))
6765

6866
@property
6967
def font(self) -> float:
70-
"""Normalized font size proxy.
71-
72-
Returns:
73-
float: Normalized line height in [0, 1] coordinate space.
74-
"""
68+
"""Return an approximate normalised font size (block height per line)."""
7569
return self.rect.height / max(self.line_count, 1)
7670

7771
@property
7872
def highness(self) -> float:
79-
"""Vertical position score.
80-
81-
Higher values for blocks closer to the top of the page.
82-
83-
Returns:
84-
float: Score in [0, 1] where 1 means the block starts at the very top of the page.
85-
"""
73+
"""Return a score favouring blocks near the top of the page."""
8674
return 1 - self.rect.y0
8775

8876
@property
8977
def score(self) -> float:
90-
"""Combined title-likelihood score.
78+
"""Return a composite title-likelihood score.
9179
92-
The metric is based on horizontal centrality, font size, and vertical position.
80+
Multiplies all heuristic signals: font size, horizontal position,
81+
vertical position, text length, non-numericality, and highness.
82+
A higher score indicates a stronger title candidate.
9383
9484
Returns:
95-
float: Estimated title-likelihood score. Higher means more likely a title.
85+
float: Non-negative composite score; 0 if any signal is False/zero.
9686
"""
97-
# TODO improve metric
98-
# return (self.horizontal_centrality * self.font * self.highness) + self.contains_keywords
99-
# return self.horizontal_centrality * self.font * self.highness
100-
return self.font
87+
return self.font * self.horizontality * self.verticality * self.length * self.non_numericality * self.highness
10188

10289

10390
def extract_title_from_page(page: pymupdf.Page) -> str:

src/entity/utils.py

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,10 @@ def pages_to_bytes(pdf_document: Document, page_start: int, page_end: int) -> By
1818
BytesIO: Selected subset of pages as bytes.
1919
"""
2020
# Create a new PDF for the selected pages
21-
select_pdf = pymupdf.open()
21+
with pymupdf.open() as select_pdf:
22+
for page_number in range(page_start, page_end + 1):
23+
# Insert the page into the new PDF
24+
select_pdf.insert_pdf(pdf_document, from_page=page_number - 1, to_page=page_number - 1)
2225

23-
for page_number in range(page_start, page_end + 1):
24-
# Insert the page into the new PDF
25-
select_pdf.insert_pdf(pdf_document, from_page=page_number - 1, to_page=page_number - 1)
26-
27-
# Extract bytes and close document
28-
select_pdf_bytes = BytesIO(select_pdf.tobytes())
29-
select_pdf.close()
30-
31-
return select_pdf_bytes
26+
# Extract bytes; the enclosing `with` block closes the document
27+
return BytesIO(select_pdf.tobytes())

src/scripts/pixtral_extract_feature.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import json
22
import logging
3+
import shutil
4+
import tempfile
35
from pathlib import Path
46

57
import click
@@ -23,7 +25,7 @@
2325
def update_ground_truth(
2426
ground_truth: DocumentGroundTruth, document: Path, pixtral_interface: PixtralFeatureExtraction
2527
) -> DocumentGroundTruth:
26-
"""Runs Pixtral feature extraction on each page of a document and updates the ground truth in-place.
28+
"""Run Pixtral feature extraction on each page and update the ground truth pages in-place.
2729
2830
Args:
2931
ground_truth (DocumentGroundTruth): Ground truth object whose pages will be updated.
@@ -39,14 +41,14 @@ def update_ground_truth(
3941
for ground_truth_page in ground_truth.pages:
4042
# Load page
4143
page = doc.load_page(ground_truth_page.page - 1)
42-
# Extarct OCR text
44+
# Extract OCR text
4345
extraction_context = extract_and_cache_page_data(page)
4446
lines = extraction_context.text_lines
4547
text_blocks = create_text_blocks(lines)
4648
text = "\n".join([line.text for block in text_blocks for line in block.lines])
4749

4850
# Extract feature (title)
49-
if text and page:
51+
if text:
5052
ground_truth_page.title = pixtral_interface.find(text=text, page=page)
5153
else:
5254
ground_truth_page.title = None

src/utils/utility.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ def get_pdf_files(input_path: Path) -> list[Path]:
9191

9292

9393
def standardize_text(text: str) -> str:
94-
"""Standardize text by removing new lines, double spaces and uppercaps.
94+
"""Standardize text by removing new lines and double spaces, and lowercasing it.
9595
9696
Args:
9797
text (str): Text to standardize.

0 commit comments

Comments
 (0)