
Commit 36a8c4c

Merge pull request #294 from saitiger/METEOR
Implementation of METEOR
2 parents bfc3014 + e0ea9ac commit 36a8c4c

File tree: 2 files changed (+187, -0 lines)

Problems/110_METEOR/Learn.md

Lines changed: 87 additions & 0 deletions
METEOR (Metric for Evaluation of Translation with Explicit ORdering) is a metric generally used for machine translation and for evaluating the text output of generative AI models. METEOR was introduced to address the limitations of earlier metrics like BLEU.

## Key Characteristics
- Considers semantic similarity beyond exact word matching
- Accounts for word order and translation variations
- Provides more human-aligned translation assessment

# Implementation
1. **Tokenization**: lowercase the sentences and split on whitespace

2. **Frequency of matching words**: matching is exact (no stemming or synonym matching in this implementation)

3. **Calculate Precision, Recall and F-mean**
```
F_mean = (Precision * Recall) / (α * Precision + (1 - α) * Recall)
```
- α is typically set to 0.9
- Balances precision and recall

4. **Fragmentation Penalty**
```
Chunks = Count of contiguous matched word sequences
Penalty = γ * (Chunks / Matches)^β
```
- β controls the penalty weight (typically 3)
- γ limits the maximum penalty (typically 0.5)

5. **Final METEOR Score**
```
METEOR = F_mean * (1 - Penalty)
```
- Ranges from 0 (no match) to 1 (perfect match)

(The three formulas above are combined in the short code sketch after the note below.)

**Note**: The [paper](https://aclanthology.org/W05-0909/) that introduced the metric doesn't expose the parameters (α, β, and γ) as tunable, but implementations in other libraries like NLTK offer this flexibility.
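
The three formulas from steps 3–5 fit in a few lines of Python. This is a minimal sketch only; the helper name `combine_meteor` is illustrative and not part of the repository's solution, and it assumes precision, recall, the chunk count, and the match count have already been computed:

```
def combine_meteor(precision, recall, chunks, matches, alpha=0.9, beta=3, gamma=0.5):
    # No overlapping unigrams at all -> the score is defined as 0
    if matches == 0:
        return 0.0
    # Parameterized harmonic mean of precision and recall (alpha = 0.9 weights recall more heavily)
    f_mean = (precision * recall) / (alpha * precision + (1 - alpha) * recall)
    # Fragmentation penalty: more chunks per match -> larger penalty, capped by gamma
    penalty = gamma * (chunks / matches) ** beta
    return f_mean * (1 - penalty)

# Worked example from the section below: P = R = 7/9, 2 chunks over 7 matches
print(round(combine_meteor(7/9, 7/9, 2, 7), 3))  # 0.769
```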

# Example

- Reference: "The quick brown fox jumps over the lazy dog"
- Candidate: "A quick brown fox jumps over a lazy dog"

### 1. Tokenization
- Reference Tokens: ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
- Candidate Tokens: ['a', 'quick', 'brown', 'fox', 'jumps', 'over', 'a', 'lazy', 'dog']

### 2. Unigram Matching
- Matching tokens: ['quick', 'brown', 'fox', 'jumps', 'over', 'lazy', 'dog']
- Matches: 7

### 3. Unigram Precision and Recall Calculation
- Precision = Matches / Candidate Length = 7 / 9 ≈ 0.778
- Recall = Matches / Reference Length = 7 / 9 ≈ 0.778
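
To make steps 1–3 concrete, here is a small sketch of the clipped unigram matching; it mirrors the Counter-based approach used in solution.py further down, and nothing beyond the two example sentences is taken from the original files:

```
from collections import Counter

reference = "The quick brown fox jumps over the lazy dog"
candidate = "A quick brown fox jumps over a lazy dog"

ref_tokens = reference.lower().split()
cand_tokens = candidate.lower().split()

# Multiset intersection clips each word to its smaller count in the two sentences
matches = sum((Counter(ref_tokens) & Counter(cand_tokens)).values())
precision = matches / len(cand_tokens)
recall = matches / len(ref_tokens)

print(matches, round(precision, 3), round(recall, 3))  # 7 0.778 0.778
```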

### 4. F-mean Calculation (α = 0.9)
```
F_mean = (Precision * Recall) / (α * Precision + (1 - α) * Recall)
       = (0.778 * 0.778) / (0.9 * 0.778 + (1 - 0.9) * 0.778)
       = 0.605 / (0.700 + 0.078)
       = 0.605 / 0.778
       ≈ 0.778
```
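
The same value can be checked with a one-line calculation (illustrative only):

```
print(round((7/9 * 7/9) / (0.9 * 7/9 + 0.1 * 7/9), 3))  # 0.778
```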

### 5. Chunk Calculation
- Contiguous matched sequences (adjacent in both candidate and reference):
  1. ['quick', 'brown', 'fox', 'jumps', 'over']
  2. ['lazy', 'dog']
- Number of Chunks: 2 (the unmatched 'a' in the candidate breaks the sequence before 'lazy')
- Total Number of Unigram Matches: 7

### 6. Penalty Calculation (β = 3, γ = 0.5)
```
Penalty = γ * (Number of Chunks / Total Number of Unigram Matches)^β
        = 0.5 * (2 / 7)^3
        = 0.5 * (0.286)^3
        ≈ 0.012
```

### 7. Final METEOR Score
```
METEOR = F_mean * (1 - Penalty)
       = 0.778 * (1 - 0.012)
       = 0.778 * 0.988
       ≈ 0.769
```
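
For completeness, a sketch of how the chunk count in step 5 can be derived programmatically: map each matched candidate word to its first unused position in the reference and count breaks in that position sequence. This mirrors the logic in solution.py below; the variable names here are illustrative.

```
# Matched reference positions, in candidate order, for the example above:
# quick->1, brown->2, fox->3, jumps->4, over->5, lazy->7, dog->8
matched_positions = [1, 2, 3, 4, 5, 7, 8]

# A new chunk starts whenever the reference positions stop being consecutive
chunks = 1 + sum(
    1 for prev, cur in zip(matched_positions, matched_positions[1:]) if cur != prev + 1
)
print(chunks)  # 2 -> Penalty = 0.5 * (2 / 7)**3 ≈ 0.012
```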

Problems/110_METEOR/solution.py

Lines changed: 100 additions & 0 deletions
from collections import Counter

def meteor_score(reference, candidate, alpha=0.9, beta=3, gamma=0.5):
    if not reference or not candidate:
        raise ValueError("Reference and candidate cannot be empty")

    # Tokenize (lowercase, whitespace split)
    ref_tokens = reference.lower().split()
    cand_tokens = candidate.lower().split()

    # Unigram counts for reference and candidate
    ref_counts = Counter(ref_tokens)
    cand_counts = Counter(cand_tokens)

    # Number of matching words in candidate and reference (clipped multiset intersection)
    num_matches = sum((ref_counts & cand_counts).values())
    ref_len = len(ref_tokens)
    cand_len = len(cand_tokens)

    # Unigram precision and recall
    precision = num_matches / cand_len if cand_len > 0 else 0  # avoid division by zero
    recall = num_matches / ref_len if ref_len > 0 else 0  # avoid division by zero

    if num_matches == 0:
        return 0.0

    fmean = (precision * recall) / (alpha * precision + (1 - alpha) * recall)

    # Chunk calculation
    matched_positions = []
    ref_positions = {}  # store positions of words in the reference
    used_positions = set()  # track already used reference indices

    # Populate reference positions for word alignment tracking
    for i, word in enumerate(ref_tokens):
        ref_positions.setdefault(word, []).append(i)

    # Determine the sequence of matched reference positions, in candidate order
    for word in cand_tokens:
        if word in ref_positions:
            for pos in ref_positions[word]:
                if pos not in used_positions:
                    matched_positions.append(pos)
                    used_positions.add(pos)
                    break  # ensure each reference position is used only once

    # Count chunks by detecting breaks in the matched position sequence
    num_chunks = 1 if matched_positions else 0
    for i in range(1, len(matched_positions)):
        if matched_positions[i] != matched_positions[i - 1] + 1:
            num_chunks += 1  # break in sequence → new chunk

    # Fragmentation penalty
    penalty = gamma * ((num_chunks / num_matches) ** beta) if num_matches > 0 else 0

    # Final score, rounded to 3 decimal places
    return round(fmean * (1 - penalty), 3)

def test_meteor_score():
    # Test Case 1: Identical translations
    # Note: even an exact match keeps a small fragmentation penalty
    # (1 chunk over 6 matches), so the score lands just below 1.0.
    ref_test1 = "The cat sits on the mat"
    cand_test1 = "The cat sits on the mat"
    expected1 = 0.998
    assert meteor_score(ref_test1, cand_test1) == expected1, "Test Case 1 Failed"

    # Test Case 2: Similar translations (the worked example from Learn.md)
    ref_test2 = "The quick brown fox jumps over the lazy dog"
    cand_test2 = "A quick brown fox jumps over a lazy dog"
    expected2 = 0.769
    assert meteor_score(ref_test2, cand_test2) == expected2, "Test Case 2 Failed"

    # Test Case 3: Largely different translations (only "the" overlaps, so the
    # score is small but nonzero)
    ref_test3 = "The cat sits on the mat"
    cand_test3 = "Dogs run in the park"
    expected3 = 0.085
    assert meteor_score(ref_test3, cand_test3) == expected3, "Test Case 3 Failed"

    # Test Case 4: Partially matching translations
    ref_test4 = "Machine learning is an exciting field"
    cand_test4 = "Machine learning algorithms are fascinating"
    expected4 = 0.318
    assert meteor_score(ref_test4, cand_test4) == expected4, "Test Case 4 Failed"

    # Test Case 5: Empty input handling
    try:
        meteor_score("", "Some text")
        assert False, "Test Case 5 Failed"
    except ValueError:
        pass

    # Test Case 6: Same words in a different order (fragmentation penalty applies)
    ref_test6 = "The cat sits on the mat"
    cand_test6 = "The cat on the mat sits"
    expected6 = 0.938
    assert meteor_score(ref_test6, cand_test6) == expected6, "Test Case 6 Failed"

if __name__ == "__main__":
    test_meteor_score()
    print("All Test Cases Passed!")
