Skip to content

Commit d34e2ca

Browse files
increase f1 to 25%
1 parent 5fbccf0 commit d34e2ca

File tree

3 files changed

+44
-4
lines changed

3 files changed

+44
-4
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@ dependencies = [
2121
"joblib>=1.5.1,<2.0.0",
2222
"numpy>=2.0.0,<3.0.0",
2323
"opencv-python>=4.12.0.88,<5.0.0.0",
24-
"pandas>=2.3.1,<3.0.0",
2524
"pillow>=11.3.0,<12.0.0",
2625
"pymupdf==1.26.3",
2726
"pydantic==2.11.7",
@@ -37,6 +36,7 @@ dependencies = [
3736
"tqdm>=4.67.1,<5.0.0",
3837
"xgboost==3.0.5",
3938
"uvicorn==0.35.0",
39+
"levenshtein==0.27.1",
4040
]
4141

4242
[project.optional-dependencies]

src/entity/titlepage_parser.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
"""Convert title / section document to processed entries."""
22

3-
from dataclasses import dataclass
43
from pathlib import Path
54

65
import pymupdf

src/evaluation.py

Lines changed: 43 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,12 @@
22
import json
33
import logging
44
import os
5+
import re
6+
import unicodedata
57
from pathlib import Path
68

79
from dotenv import load_dotenv
10+
from Levenshtein import distance
811
from pydantic import TypeAdapter
912

1013
from src.page_classes import PageClasses
@@ -53,6 +56,43 @@ def groundtruth_doc_to_pages(documents: list[DocumentGroundTruth]) -> dict[str,
5356
return {f"{doc.filename}-{page.page}": page for doc in documents for page in doc.pages}
5457

5558

59+
def standardize_text(text: str) -> str:
    """Standardize text by collapsing whitespace, stripping accents and lowercasing.

    Args:
        text (str): Text to standardize.

    Returns:
        str: Standardized text.
    """
    # Collapse every run of whitespace (spaces, tabs, newlines) into a single
    # space; this also covers the newline removal the original did separately.
    text = re.sub(r"\s+", " ", text).strip()
    # Strip diacritics, e.g. "ü" -> "u": decompose (NFD) then drop the
    # combining-mark code points (Unicode category "Mn").
    text = "".join(c for c in unicodedata.normalize("NFD", text) if unicodedata.category(c) != "Mn")
    # Enforce lowercase.
    return text.lower()
76+
77+
78+
def are_texts_close(text_gt: str, text_pred: str, r_error: float = 0.25) -> bool:
    """Check if two texts are similar based on Levenshtein distance.

    Both texts are standardized (whitespace collapsed, accents stripped,
    lowercased) before matching.

    Args:
        text_gt (str): Ground truth text.
        text_pred (str): Predicted text.
        r_error (float, optional): Accepted relative error — edit distance
            divided by the ground-truth length. Defaults to 0.25.

    Returns:
        bool: True if both texts are considered close to each other.
    """
    text_gt = standardize_text(text_gt)
    text_pred = standardize_text(text_pred)
    # max(1, ...) guards against division by zero when the ground truth is empty.
    return distance(text_gt, text_pred) / max(1, len(text_gt)) < r_error
94+
95+
5696
def compute_classification_stats(predictions: dict[str, DocumentPage], ground_truth: dict[str, DocumentPage]) -> dict:
5797
"""Compute per-label classification confusion statistics over matched page keys.
5898
@@ -101,15 +141,16 @@ def compute_title_stats(predictions: dict[str, DocumentPage], ground_truth: dict
101141
for key in common_keys:
102142
pred_title = predictions[key].title
103143
gt_title = ground_truth[key].title
104-
logger.info(f"{key}: {gt_title} == {pred_title}")
105144
# Check if GT exists
106145
if not gt_title:
107146
continue
108147

109148
# Measure
110-
if pred_title == gt_title:
149+
if pred_title and are_texts_close(gt_title, pred_title):
111150
stats["true_positives"] += 1
112151
else:
152+
# TODO: remove before final PR
153+
logger.info(f"{key}: {gt_title} == {pred_title}")
113154
stats["false_positives"] += 1
114155
stats["false_negatives"] += 1
115156

0 commit comments

Comments
 (0)