|
1 | 1 | """Convert title / section document to processed entries.""" |
2 | 2 |
|
| 3 | +import re |
3 | 4 | from dataclasses import dataclass |
4 | 5 |
|
5 | 6 | import pymupdf |
|
8 | 9 |
|
9 | 10 | from src.models.feature_engineering import extract_and_cache_page_data |
10 | 11 | from src.utils.text_clustering import create_text_blocks |
11 | | -from src.utils.utility import standardize_text |
12 | 12 |
|
13 | 13 |
|
14 | 14 | @dataclass |
@@ -38,66 +38,53 @@ def __init__(self, text_block: TextBlock, rect: Rect): |
38 | 38 | ) |
39 | 39 |
|
40 | 40 | @property |
41 | | - def contains_keywords(self) -> int: |
42 | | - """Score item if it contains a keyword. |
43 | | -
|
44 | | - Returns: |
45 | | - int: 1 if keywords found, 0 otherwise. |
46 | | - """ |
47 | | - std_text = standardize_text(self.text) |
48 | | - return int(any([keyword in std_text for keyword in ["bericht", "etude"]])) |
| 41 | + def length(self) -> float: |
| 42 | + """Return 1.0 if the text contains more than 5 characters, else 0.0.""" |
| 43 | + return float(len(self.text) > 5) |
49 | 44 |
|
50 | 45 | @property |
51 | | - def horizontal_centrality(self) -> float: |
52 | | - """Horizontal centrality of the block. |
| 46 | + def horizontality(self) -> float: |
| 47 | + """Return 1.0 if the block starts in the left 40% of the page width, else 0.0.""" |
| 48 | + return float(self.rect.x0 < 0.4) |
53 | 49 |
|
54 | | - Returns: |
55 | | - float: Score in [0, 1] where 1 means the block is perfectly horizontally centered. |
56 | | - """ |
57 | | - return 1 - 2 * abs(0.5 - (self.rect.x1 + self.rect.x0) / 2) |
| 50 | + @property |
| 51 | + def verticality(self) -> float: |
| 52 | + """Return 1.0 if the block ends in the upper 75% of the page height, else 0.0.""" |
| 53 | + return float(self.rect.y1 < 0.75) |
58 | 54 |
|
59 | 55 | @property |
60 | | - def horizontal_leftness(self) -> float: |
61 | | - """Horizontal leftness score of the block. |
| 56 | + def non_numericality(self) -> float: |
| 57 | + """Return the fraction of non-digit characters in the text. |
62 | 58 |
|
63 | 59 | Returns: |
64 | | - float: Score in [0, 1] where higher values indicate left position. |
| 60 | + float: Value in [0, 1]; 1.0 means no digits, 0.0 means all digits. |
65 | 61 | """ |
66 | | - return min(1, 2 - (self.rect.x1 + self.rect.x0)) |
| 62 | + n_digits = len(re.findall(r"\d", self.text)) |
| 63 | + n_total = len(self.text) |
| 64 | + return 1 - (n_digits / max(n_total, 1)) |
67 | 65 |
|
68 | 66 | @property |
69 | 67 | def font(self) -> float: |
70 | | - """Normalized font size proxy. |
71 | | -
|
72 | | - Returns: |
73 | | - float: Normalized line height in [0, 1] coordinate space. |
74 | | - """ |
| 68 | + """Return an approximate normalised font size (block height per line).""" |
75 | 69 | return self.rect.height / max(self.line_count, 1) |
76 | 70 |
|
77 | 71 | @property |
78 | 72 | def highness(self) -> float: |
79 | | - """Vertical position score. |
80 | | -
|
81 | | - Higher values for blocks closer to the top of the page. |
82 | | -
|
83 | | - Returns: |
84 | | - float: Score in [0, 1] where 1 means the block starts at the very top of the page. |
85 | | - """ |
| 73 | + """Return a score favouring blocks near the top of the page.""" |
86 | 74 | return 1 - self.rect.y0 |
87 | 75 |
|
88 | 76 | @property |
89 | 77 | def score(self) -> float: |
90 | | - """Combined title-likelihood score. |
| 78 | + """Return a composite title-likelihood score. |
91 | 79 |
|
92 | | - The metric is based on horizontal centrality, font size, and vertical position. |
| 80 | + Multiplies all heuristic signals: font size, horizontal position, |
| 81 | + vertical position, text length, non-numericality, and highness. |
| 82 | + A higher score indicates a stronger title candidate. |
93 | 83 |
|
94 | 84 | Returns: |
95 | | - float: Estimated title-likelihood score. Higher means more likely a title. |
| 85 | + float: Non-negative composite score; 0.0 if any individual signal is 0.0. |
96 | 86 | """ |
97 | | - # TODO improve metric |
98 | | - # return (self.horizontal_centrality * self.font * self.highness) + self.contains_keywords |
99 | | - # return self.horizontal_centrality * self.font * self.highness |
100 | | - return self.font |
| 87 | + return self.font * self.horizontality * self.verticality * self.length * self.non_numericality * self.highness |
101 | 88 |
|
102 | 89 |
|
103 | 90 | def extract_title_from_page(page: pymupdf.Page) -> str: |
|
0 commit comments