Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,18 @@ jobs:
- name: Cache docker/setup-buildx
uses: docker/setup-buildx-action@v3

- name: Increase disk space available for building images
run: |
sudo rm -rf \
/usr/share/dotnet \
/usr/local/lib/android \
/usr/local/.ghcup \
/opt/ghc \
"$AGENT_TOOLSDIRECTORY" \
/usr/local/share/powershell \
/usr/share/swift \
/usr/lib/jvm || true

- name: Disk Usage - initial size
run: |
echo "Disk usage summary:"
Expand Down
28 changes: 17 additions & 11 deletions src/pyonb/analysis/eval_ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@
from pyonb.analysis.metrics import cer, ned, wer


def read_file(file_path: Path) -> str | dict:
def read_file(file_path: Path, file_encoding: str | None = None) -> str | dict:
"""Read .txt or .json file."""
with Path.open(file_path, "r") as f:
with Path.open(file_path, "r", encoding=file_encoding) as f:
file_type = file_path.suffix.lower()

if file_type == ".json":
Expand All @@ -31,14 +31,10 @@ def evaluate_metrics(gt_text: str, ocr_text: str) -> dict:
return {"cer": cer_result, "wer": wer_result, "ned": ned_result}


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Run and evaluate OCR performance metrics.")
parser.add_argument("-gt", "--ground_truth_file", type=str, required=True, help="[.txt] Path to ground truth file.")
parser.add_argument("-ocr", "--ocr_file", type=str, required=True, help="[.json/.txt] Path to OCR processed file.")
args = parser.parse_args()

gt_file_output = read_file(Path(args.ground_truth_file))
ocr_file_output = read_file(Path(args.ocr_file))
def run(gt_path: Path, ocr_path: Path) -> dict:
"""Run OCR evaluation given ground truth and OCR file paths."""
gt_file_output = read_file(gt_path)
ocr_file_output = read_file(ocr_path)

if isinstance(ocr_file_output, str):
result = evaluate_metrics(str(gt_file_output), str(ocr_file_output))
Expand All @@ -49,4 +45,14 @@ def evaluate_metrics(gt_text: str, ocr_text: str) -> dict:
msg = "OCR file is not .txt or .json."
raise TypeError(msg)

print(f"OCR Evaluation results:\n{result}") # noqa: T201
return result


if __name__ == "__main__":
    # CLI entry point: compare an OCR output file against its ground truth.
    cli = argparse.ArgumentParser(description="Run and evaluate OCR performance metrics.")
    cli.add_argument("-gt", "--ground_truth_file", type=str, required=True, help="[.txt] Path to ground truth file.")
    cli.add_argument("-ocr", "--ocr_file", type=str, required=True, help="[.json/.txt] Path to OCR processed file.")
    cli_args = cli.parse_args()

    # Delegate the actual evaluation to run(), then report the metric dict.
    evaluation = run(Path(cli_args.ground_truth_file), Path(cli_args.ocr_file))
    print(f"OCR Evaluation results:\n{evaluation}")  # noqa: T201
26 changes: 22 additions & 4 deletions src/pyonb/analysis/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ def cer(gt: str, pred: str) -> float:
Character Error Rate (CER): edit distance / length of ground truth.

CER = 0 - Perfect character match
CER = 1 - Completely different
CER = >0 - percentage of character changes or insertions to match - can be >100% if insertion required
"""
if not gt:
return float("inf") if pred else 0.0
def wer(gt: str, pred: str) -> float:
    """
    Word Error Rate (WER): edit distance over tokenized words.

    WER = 0 - Perfect word match
    WER = >0 - percentage of word changes or insertions to match - can be >100% if insertion required

    :param gt: ground-truth text (tokenized on whitespace).
    :param pred: OCR/predicted text (tokenized on whitespace).
    :return: word-level edit distance divided by the ground-truth word count,
        rounded to 3 decimals. max(1, ...) guards against an empty ground truth.
    """
    gt_words = gt.split()
    pred_words = pred.split()

    # Word-level Levenshtein distance (Wagner-Fischer). Only the previous DP
    # row is ever read, so keep two rows instead of the full
    # (len(gt_words)+1) x (len(pred_words)+1) matrix: O(m) memory, same result.
    prev = list(range(len(pred_words) + 1))  # distance from the empty gt prefix
    for i in range(1, len(gt_words) + 1):
        curr = [i]  # deleting all i ground-truth words seen so far
        for j in range(1, len(pred_words) + 1):
            cost = 0 if gt_words[i - 1] == pred_words[j - 1] else 1
            curr.append(
                min(
                    prev[j] + 1,  # deletion
                    curr[j - 1] + 1,  # insertion
                    prev[j - 1] + cost,  # substitution
                )
            )
        prev = curr

    return round(prev[len(pred_words)] / max(1, len(gt_words)), 3)


def emr(gt_list: list[str], pred_list: list[str]) -> float:
Expand All @@ -41,7 +59,7 @@ def ned(gt: str, pred: str) -> float:
Normalized Edit Distance: edit distance / max length.

NED = 0 - Perfect match, strings identical
NED = 1 - Maximum dissimilarity
NED = 1 - all characters changed to make strings identical
"""
max_len = max(len(gt), len(pred))
if max_len == 0:
Expand Down
Loading