feat(similarity): support swu signs

AmitMY · AmitMY · commit 7cdf31b3b0d1 · 2025-05-07T14:05:22.000+02:00
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,5 @@
 .idea/
 build/
 signwriting_evaluation.egg-info/
-**/__pycache__/
+**/__pycache__/
+.env
diff --git a/signwriting_evaluation/metrics/similarity.py b/signwriting_evaluation/metrics/similarity.py
@@ -4,6 +4,7 @@
 from scipy.optimize import linear_sum_assignment
 from scipy.spatial import distance as dis
 from signwriting.formats.fsw_to_sign import fsw_to_sign
+from signwriting.formats.swu_to_fsw import swu2fsw
 from signwriting.tokenizer import normalize_signwriting
 from signwriting.types import Sign, SignSymbol
 
@@ -98,17 +99,22 @@ def error_rate(self, hyp: Sign, ref: Sign) -> float:
         return length_weight + mean_cost * (1 - length_weight)
 
     def score_single_sign(self, hypothesis: str, reference: str) -> float:
+        """Parse both FSW strings into signs and return (1 - error_rate) squared."""
         # Calculate the evaluate score for a given hypothesis and ref.
         hyp = fsw_to_sign(hypothesis)
         ref = fsw_to_sign(reference)
         return pow(1 - self.error_rate(hyp, ref), 2)
 
+    def _text_to_signs(self, text: str) -> list[str]:
+        # swu2fsw rewrites SWU symbols as FSW and keeps any FSW symbols already present
+        return normalize_signwriting(swu2fsw(text)).split(" ")
+
     def score(self, hypothesis: str, reference: str) -> float:
         # Here, hypothesis and reference are both FSW strings of potentially different number of signs
-        hypothesis_signs = normalize_signwriting(hypothesis).split(" ")
-        reference_signs = normalize_signwriting(reference).split(" ")
+        hypothesis_signs = self._text_to_signs(hypothesis)
+        reference_signs = self._text_to_signs(reference)
         if len(hypothesis_signs) == 1 and len(reference_signs) == 1:
-            return self.score_single_sign(hypothesis, reference)
+            return self.score_single_sign(hypothesis_signs[0], reference_signs[0])
 
         # Pad with empty strings to make sure the number of signs is the same
         if len(hypothesis_signs) != len(reference_signs):
diff --git a/signwriting_evaluation/metrics/test_similarity.py b/signwriting_evaluation/metrics/test_similarity.py
@@ -58,6 +58,13 @@ def test_bad_fsw_equals_0(self):
         self.assertIsInstance(score, float)
         self.assertAlmostEqual(score, 0)
 
+    def test_score_swu(self):
+        hypothesis = "𝠃𝤤𝤬񎱃𝤎𝣠񂇒𝣿𝤀񆕁𝣺𝤐񇆤𝣧𝤗"
+        reference = "𝠃𝤙𝤨񎵡𝤃𝣤񎲬𝤃𝣷񂈒𝣽𝤇񇆤𝣳𝤓"
+        score = self.metric.score(hypothesis, reference)
+        self.assertIsInstance(score, float)  # SWU input must still yield a float score
+        self.assertAlmostEqual(score, 0.5509574768254414)  # pinned expected value for this SWU pair
+
 
 if __name__ == '__main__':
     unittest.main()