Skip to content

Commit cf4a848

Browse files
authored
Merge pull request #39 from SAFEHR-data/tomr/ocr-analysis
Adds OCR evaluation script / Fixes WER calculation
2 parents 4845baa + b4c9813 commit cf4a848

File tree

3 files changed

+51
-15
lines changed

3 files changed

+51
-15
lines changed

.github/workflows/tests.yml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,18 @@ jobs:
3535
- name: Cache docker/setup-buildx
3636
uses: docker/setup-buildx-action@v3
3737

38+
- name: Increase disk space available for building images
39+
run: |
40+
sudo rm -rf \
41+
/usr/share/dotnet \
42+
/usr/local/lib/android \
43+
/usr/local/.ghcup \
44+
/opt/ghc \
45+
"$AGENT_TOOLSDIRECTORY" \
46+
/usr/local/share/powershell \
47+
/usr/share/swift \
48+
/usr/lib/jvm || true
49+
3850
- name: Disk Usage - initial size
3951
run: |
4052
echo "Disk usage summary:"

src/pyonb/analysis/eval_ocr.py

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,9 @@
88
from pyonb.analysis.metrics import cer, ned, wer
99

1010

11-
def read_file(file_path: Path) -> str | dict:
11+
def read_file(file_path: Path, file_encoding: str | None = None) -> str | dict:
1212
"""Read .txt or .json file."""
13-
with Path.open(file_path, "r") as f:
13+
with Path.open(file_path, "r", encoding=file_encoding) as f:
1414
file_type = file_path.suffix.lower()
1515

1616
if file_type == ".json":
@@ -31,14 +31,10 @@ def evaluate_metrics(gt_text: str, ocr_text: str) -> dict:
3131
return {"cer": cer_result, "wer": wer_result, "ned": ned_result}
3232

3333

34-
if __name__ == "__main__":
35-
parser = argparse.ArgumentParser(description="Run and evaluate OCR performance metrics.")
36-
parser.add_argument("-gt", "--ground_truth_file", type=str, required=True, help="[.txt] Path to ground truth file.")
37-
parser.add_argument("-ocr", "--ocr_file", type=str, required=True, help="[.json/.txt] Path to OCR processed file.")
38-
args = parser.parse_args()
39-
40-
gt_file_output = read_file(Path(args.ground_truth_file))
41-
ocr_file_output = read_file(Path(args.ocr_file))
34+
def run(gt_path: Path, ocr_path: Path) -> dict:
35+
"""Run OCR evaluation given ground truth and OCR file paths."""
36+
gt_file_output = read_file(gt_path)
37+
ocr_file_output = read_file(ocr_path)
4238

4339
if isinstance(ocr_file_output, str):
4440
result = evaluate_metrics(str(gt_file_output), str(ocr_file_output))
@@ -49,4 +45,14 @@ def evaluate_metrics(gt_text: str, ocr_text: str) -> dict:
4945
msg = "OCR file is not .txt or .json."
5046
raise TypeError(msg)
5147

52-
print(f"OCR Evaluation results:\n{result}") # noqa: T201
48+
return result
49+
50+
51+
if __name__ == "__main__":
    # CLI entry point: collect the two file paths, delegate to run(), report.
    arg_parser = argparse.ArgumentParser(description="Run and evaluate OCR performance metrics.")
    arg_parser.add_argument("-gt", "--ground_truth_file", type=str, required=True, help="[.txt] Path to ground truth file.")
    arg_parser.add_argument("-ocr", "--ocr_file", type=str, required=True, help="[.json/.txt] Path to OCR processed file.")
    cli_args = arg_parser.parse_args()

    results = run(Path(cli_args.ground_truth_file), Path(cli_args.ocr_file))
    print(f"OCR Evaluation results:\n{results}")  # noqa: T201

src/pyonb/analysis/metrics.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ def cer(gt: str, pred: str) -> float:
88
Character Error Rate (CER): edit distance / length of ground truth.
99
1010
CER = 0 - Perfect character match
11-
CER = 1 - Completely different
11+
CER > 0 - ratio of character edits needed; values > 1.0 indicate more edits than original characters
1212
"""
1313
if not gt:
1414
return float("inf") if pred else 0.0
def wer(gt: str, pred: str) -> float:
    """Compute the Word Error Rate (WER) between two strings.

    Word Error Rate (WER): word-level edit distance / number of ground-truth words.

    WER = 0 - Perfect word match
    WER > 0 - ratio of word edits needed; values > 1.0 indicate more edits than original words
    """
    gt_words = gt.split()
    pred_words = pred.split()

    # Wagner-Fischer dynamic-programming matrix for word-level Levenshtein
    # distance. dp[i][j] = cost of transforming the first i ground-truth words
    # into the first j predicted words.
    dp = [[0] * (len(pred_words) + 1) for _ in range(len(gt_words) + 1)]

    # Base cases: transforming to/from an empty sequence costs its length.
    for i in range(len(gt_words) + 1):
        dp[i][0] = i
    for j in range(len(pred_words) + 1):
        dp[0][j] = j

    for i in range(1, len(gt_words) + 1):
        for j in range(1, len(pred_words) + 1):
            cost = 0 if gt_words[i - 1] == pred_words[j - 1] else 1
            dp[i][j] = min(
                dp[i - 1][j] + 1,  # deletion
                dp[i][j - 1] + 1,  # insertion
                dp[i - 1][j - 1] + cost,  # substitution
            )

    # max(1, ...) guards against division by zero for an empty ground truth.
    return round(dp[len(gt_words)][len(pred_words)] / max(1, len(gt_words)), 3)
2846

2947

3048
def emr(gt_list: list[str], pred_list: list[str]) -> float:
@@ -41,7 +59,7 @@ def ned(gt: str, pred: str) -> float:
4159
Normalized Edit Distance: edit distance / max length.
4260
4361
NED = 0 - Perfect match, strings identical
44-
NED = 1 - Maximum dissimilarity
62+
NED = 1 - maximum dissimilarity (every character position must change)
4563
"""
4664
max_len = max(len(gt), len(pred))
4765
if max_len == 0:

0 commit comments

Comments
 (0)