Skip to content

Commit 47f15b1

Browse files
first estimation for title and section
1 parent 33d258c commit 47f15b1

File tree

5 files changed

+202
-26
lines changed

5 files changed

+202
-26
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ venv/
2121
minio
2222

2323
# IDE config
24+
.claude/
2425
.idea/
2526
.vscode/*
2627
!.vscode/launch.json.template.jsonc

main.py

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,10 @@
88
from dotenv import load_dotenv
99
from swissgeol_doc_processing.utils.file_utils import read_params as swissgeol_read_params
1010

11-
from src.boreprofile.entity_parser import document_to_boreprofiles
1211
from src.classifiers.classifier_factory import ClassifierTypes, create_classifier
1312
from src.constants import DEFAULT_TREEBASED_MODEL_PATH
13+
from src.entity.borehole_parser import document_to_boreprofiles
14+
from src.entity.titlepage_parser import document_to_titlepages
1415
from src.page_classes import PageClasses
1516
from src.page_structure import (
1617
ProcessedEntities,
@@ -34,8 +35,22 @@
3435

3536

3637
def setup_mlflow(
37-
input_path: Path, ground_truth_path: Path, model_path: str, matching_params: dict, classifier_name: str
38+
input_path: Path,
39+
matching_params: dict,
40+
ground_truth_path: Path | None,
41+
model_path: str | None = None,
42+
classifier_name: str | None = None,
3843
):
44+
"""Configure MLflow tracking with experiment metadata and git information.
45+
46+
Args:
47+
input_path (Path): Path to input PDF directory.
48+
matching_params (dict): Dictionary of matching parameters.
49+
ground_truth_path (Path | None): Path to ground truth JSON file, or None to skip.
50+
model_path (str | None): Path to pretrained model file, or None to use the default.
51+
classifier_name (str | None): Name of the classifier being used, or None if not applicable.
52+
53+
"""
3954
mlflow.set_experiment("PDF Page Classification")
4055
mlflow.start_run()
4156

@@ -60,7 +75,17 @@ def setup_mlflow(
6075
logger.warning(f"Could not attach Git metadata to MLflow: {e}")
6176

6277

63-
def flatten_dict(d, parent_key="", sep=".") -> dict:
78+
def flatten_dict(d: dict, parent_key: str = "", sep: str = ".") -> dict:
79+
"""Flatten a nested dictionary into a single-level dictionary.
80+
81+
Args:
82+
d (dict): Dictionary to flatten.
83+
parent_key (str): Parent key prefix for nested keys.
84+
sep (str): Separator character for joining keys (default ".").
85+
86+
Returns:
87+
dict: A flattened dictionary with separated keys.
88+
"""
6489
items = []
6590
for k, v in d.items():
6691
new_key = f"{parent_key}{sep}{k}" if parent_key else k
@@ -182,6 +207,10 @@ def forward_document_entities_group(
182207
"""
183208
if classification == PageClasses.BOREPROFILE:
184209
return document_to_boreprofiles(pdf_file=pdf_file, page_start=page_start, page_end=page_end, lang=language)
210+
elif classification == PageClasses.TITLE_PAGE or classification == PageClasses.SECTION_HEADER:
211+
return document_to_titlepages(
212+
pdf_file=pdf_file, classification=classification, page_start=page_start, page_end=page_end, lang=language
213+
)
185214
else:
186215
return [
187216
ProcessedEntities(
Lines changed: 3 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -6,34 +6,14 @@
66

77
import pymupdf
88
from extraction.runner import extract
9-
from pymupdf import Document
109

10+
from src.entity.utils import _select_pages
1111
from src.page_classes import PageClasses
1212
from src.page_structure import ProcessedEntities
1313

1414
logger = logging.getLogger(__name__)
1515

1616

17-
def _select_pages(pdf_document: Document, page_numbers: list[int]) -> Document:
18-
"""Select pages from PDF.
19-
20-
Args:
21-
pdf_document (Document): PDF to split.
22-
page_numbers (list[int]): List of pages to extract (1-based).
23-
24-
Returns:
25-
Document: Selected subset.
26-
"""
27-
# Create a new PDF for the selected pages
28-
select_pdf = pymupdf.open()
29-
30-
for page_number in page_numbers:
31-
# Insert the page into the new PDF
32-
select_pdf.insert_pdf(pdf_document, from_page=page_number - 1, to_page=page_number - 1)
33-
34-
return select_pdf
35-
36-
3717
def _find_undetected_pages(
3818
entities: list[ProcessedEntities],
3919
page_numbers: list[int],
@@ -106,7 +86,7 @@ def document_to_boreprofiles(
10686
pdf_file (Path): Path to pdf file.
10787
page_start (int): Starting page (1-based).
10888
page_end (int): Ending page (1-based).
109-
lang (str): Detected language.
89+
lang (str | None): Detected language.
11090
11191
Returns:
11292
list[ProcessedEntities]: List of boreprofile as entities.
@@ -116,7 +96,7 @@ def document_to_boreprofiles(
11696

11797
# Open the PDF file, select pages and save
11898
with pymupdf.Document(pdf_file) as doc:
119-
pdf_document_select = _select_pages(doc, page_numbers)
99+
pdf_document_select = _select_pages(doc, page_start, page_end)
120100
bytes_document_select = BytesIO(pdf_document_select.tobytes())
121101

122102
# Write file to temp location for inference

src/entity/titlepage_parser.py

Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
"""Convert title / section document to processed entries."""
2+
3+
from dataclasses import dataclass
4+
from pathlib import Path
5+
6+
import pymupdf
7+
from pymupdf import Rect
8+
from swissgeol_doc_processing.text.textblock import TextBlock
9+
10+
from src.entity.utils import _select_pages
11+
from src.models.feature_engineering import extract_and_cache_page_data
12+
from src.page_classes import PageClasses
13+
from src.page_structure import ProcessedEntities
14+
from src.utils.text_clustering import create_text_blocks
15+
16+
17+
@dataclass
class TitleCandidateTextBlock:
    """Text block rescaled to page-relative coordinates and scored as a title candidate.

    The block rectangle is divided by the page width / height so that scores are
    comparable across pages of different sizes.
    """

    # Declared fields must match the attributes assigned in __init__: the
    # dataclass-generated __repr__ / __eq__ read these names on the instance.
    text: str
    line_count: int
    rect: Rect

    def __init__(self, text_block: TextBlock, rect: Rect):
        """Create a scale invariant text block.

        The normalized text block is contained in a fictive [0, 0, 1, 1] rect
        (for blocks that lie inside the page).

        Args:
            text_block (TextBlock): Input text block; its `text`, `line_count` and `rect` are read.
            rect (Rect): Size of the page linked to text block.
        """
        self.text = text_block.text
        self.line_count = text_block.line_count
        # Divide each coordinate by the page extent to obtain page-relative coordinates.
        self.rect = Rect(
            text_block.rect.x0 / rect.width,
            text_block.rect.y0 / rect.height,
            text_block.rect.x1 / rect.width,
            text_block.rect.y1 / rect.height,
        )

    @property
    def horizontal_centrality(self) -> float:
        """Horizontal centrality of the block.

        Returns:
            float: 1 when the block's horizontal midpoint sits at the page center,
                decreasing linearly to 0 when the midpoint reaches a page edge.
        """
        horizontal_midpoint = (self.rect.x1 + self.rect.x0) / 2
        return 1 - 2 * abs(0.5 - horizontal_midpoint)

    @property
    def font(self) -> float:
        """Normalized font size proxy.

        Returns:
            float: Normalized block height divided by the number of lines, or 0.0
                when the block reports no lines (avoids a division by zero).
        """
        if not self.line_count:
            return 0.0
        return self.rect.height / self.line_count

    @property
    def highness(self) -> float:
        """Vertical position score.

        Higher values for blocks closer to the top of the page.

        Returns:
            float: 1 minus the normalized top coordinate, i.e. 1 when the block
                starts at the very top of the page.
        """
        return 1 - self.rect.y0

    @property
    def score(self) -> float:
        """Combined title-likelihood score.

        The metric is the product of horizontal centrality, font size proxy, and
        vertical position.

        Returns:
            float: Estimated title-likelihood score. Higher means more likely a title.
        """
        return self.horizontal_centrality * self.font * self.highness
82+
83+
84+
def _extract_title_from_page(page) -> str:
    """Extract the most likely title string from a single PDF page.

    Builds text blocks from the page's text lines, wraps them as
    scale-invariant blocks, scores them by title-likelihood, and returns
    the text of the highest-scoring candidate.

    Args:
        page (pymupdf.Page): The PDF page to analyse.

    Returns:
        str: Detected title for the page, or an empty string when no text
            block could be built for the page.
    """
    # Extract text blocks from the page
    extraction_context = extract_and_cache_page_data(page)
    text_blocks = create_text_blocks(extraction_context.text_lines)

    title_candidates = [TitleCandidateTextBlock(text_block=text_block, rect=page.rect) for text_block in text_blocks]

    # A page without any text block has no candidate; indexing the empty list
    # would raise IndexError, so report "no title" instead.
    if not title_candidates:
        return ""

    # max() returns the first maximal element, matching what a stable
    # descending sort followed by [0] would pick.
    return max(title_candidates, key=lambda candidate: candidate.score).text
106+
107+
108+
def document_to_titlepages(
    pdf_file: Path, classification: PageClasses, page_start: int, page_end: int, lang: str | None
) -> list[ProcessedEntities]:
    """Extract title or section-header entities from a consecutive page range in a PDF.

    Each page is processed individually and yields one ProcessedEntities entry whose `title` field
    contains the detected title.

    Args:
        pdf_file (Path): Path to the source PDF file.
        classification (PageClasses): Page class label to assign.
        page_start (int): First page index of the group (1-based).
        page_end (int): Last page index of the group (1-based).
        lang (str | None): Language code for the page group, or None if unknown.

    Returns:
        list[ProcessedEntities]: One ProcessedEntities per page, each with its `title`
            field set to the highest-scoring title candidate extracted from that page.
    """
    # Both the source PDF and the page subset are closed on exit; the subset
    # document was previously left open after the titles had been extracted.
    with pymupdf.Document(pdf_file) as doc, _select_pages(doc, page_start, page_end) as selected_pages:
        # NOTE(review): every entity carries the full group range
        # (page_start..page_end), not the individual page number - confirm
        # whether per-page ranges are intended.
        return [
            ProcessedEntities(
                classification=classification,
                page_start=page_start,
                page_end=page_end,
                language=lang,
                title=_extract_title_from_page(page=page),
            )
            for page in selected_pages.pages()
        ]

src/entity/utils.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
"""Base utils for entity extraction."""
2+
3+
import pymupdf
4+
from pymupdf import Document
5+
6+
7+
def _select_pages(pdf_document: Document, page_start: int, page_end: int) -> Document:
    """Build a new PDF that holds only the requested page range.

    Args:
        pdf_document (Document): Source PDF to copy pages from.
        page_start (int): First page to copy (1-based, inclusive).
        page_end (int): Last page to copy (1-based, inclusive).

    Returns:
        Document: Newly created document containing the copied pages.
    """
    subset = pymupdf.open()

    # The inputs are 1-based while insert_pdf is called with 0-based indices,
    # hence the shifted range. Pages are copied one at a time.
    for zero_based_index in range(page_start - 1, page_end):
        subset.insert_pdf(pdf_document, from_page=zero_based_index, to_page=zero_based_index)

    return subset

0 commit comments

Comments
 (0)