Commit 31924fb

adding unit tests for QA tests
Parent: 63a7b30

5 files changed (+129, -87 lines)

llmtune/qa/generics.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,5 +1,4 @@
 import statistics
-from abc import ABC, abstractmethod
 from pathlib import Path
 from typing import Dict, List, Union

@@ -14,6 +13,7 @@ class LLMMetricSuite:
     Represents and runs a suite of metrics on a set of prompts,
     golden responses, and model predictions.
     """
+
     def __init__(
         self,
         metrics: List[LLMQaMetric],
```

llmtune/qa/qa_metrics.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -24,6 +24,7 @@ class LLMQaMetric(ABC):
     Abstract base class for a metric. A metric can be computed over a single
     data instance, and outputs a scalar value (integer or float).
     """
+
     @property
     @abstractmethod
     def metric_name(self) -> str:

@@ -82,6 +83,7 @@ def get_metric(self, prompt: str, ground_truth: str, model_prediction: str) -> U
 class DotProductSimilarityMetric(LLMQaMetric):
     """Encodes both the ground truth and model prediction using DistilBERT, and
     computes the dot product similarity between the two embeddings."""
+
     def __init__(self):
         model_name = "distilbert-base-uncased"
         self.tokenizer = DistilBertTokenizer.from_pretrained(model_name)
```
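These two hunks only add blank lines after docstrings, but they expose the LLMQaMetric contract: an abstract metric_name property plus a get_metric(prompt, ground_truth, model_prediction) method returning a scalar. A minimal sketch of a concrete subclass under that assumption (the class and metric name here are hypothetical, not part of the library):

```python
from llmtune.qa.qa_metrics import LLMQaMetric


class CharLengthDiffMetric(LLMQaMetric):
    """Hypothetical metric: absolute character-length difference between
    the ground truth and the model prediction."""

    @property
    def metric_name(self) -> str:
        return "char_length_diff"

    def get_metric(self, prompt: str, ground_truth: str, model_prediction: str) -> int:
        # Returns a scalar (int), per the LLMQaMetric docstring above.
        return abs(len(ground_truth) - len(model_prediction))
```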

llmtune/qa/qa_tests.py

Lines changed: 4 additions & 3 deletions
```diff
@@ -1,5 +1,4 @@
 from abc import ABC, abstractmethod
-from typing import List, Union

 from langchain.evaluation import JsonValidityEvaluator

@@ -9,6 +8,7 @@ class LLMQaTest(ABC):
     Abstract base class for a test. A test can be computed over a single
     data instance/llm response, and outputs a boolean value (pass or fail).
     """
+
     @property
     @abstractmethod
     def test_name(self) -> str:

@@ -25,14 +25,15 @@ class JSONValidityTest(LLMQaTest):
     to langchain_core.utils.json.parse_json_markdown
     The JSON can be wrapped in markdown and this test will still pass
     """
+
     def __init__(self):
         self.json_validity_evaluator = JsonValidityEvaluator()

     @property
     def test_name(self) -> str:
         return "json_valid"

-    def get_metric(self, model_prediction: str) -> bool:
-        result = self.json_validity_evaluator.evaluate_strings(prediction=model_prediction)
+    def test(self, prompt: str, ground_truth: str, model_pred: str) -> bool:
+        result = self.json_validity_evaluator.evaluate_strings(prediction=model_pred)
         binary_res = result["score"]
         return bool(binary_res)
```
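The rename from get_metric to test brings JSONValidityTest in line with the three-argument signature the updated unit tests call, and the bool return matches the LLMQaTest docstring (pass or fail). A quick usage sketch; note that this particular test ignores the prompt and ground truth and only validates the prediction:

```python
from llmtune.qa.qa_tests import JSONValidityTest

json_test = JSONValidityTest()

# Valid JSON passes; markdown-fenced JSON also passes, per the docstring.
assert json_test.test("prompt", "ground truth", '{"Answer": "The cat"}') is True

# Anything the evaluator cannot parse as JSON fails.
assert json_test.test("prompt", "ground truth", "not json at all") is False
```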

tests/qa/test_qa_metrics.py

Lines changed: 17 additions & 83 deletions
```diff
@@ -1,105 +1,39 @@
 import pytest

-from llmtune.qa.qa_metrics import (
-    AdjectivePercentMetric,
-    DotProductSimilarityMetric,
-    JaccardSimilarityMetric,
-    JSONValidityMetric,
-    LengthMetric,
-    NounPercentMetric,
-    RougeScoreMetric,
-    VerbPercentMetric,
-    WordOverlapMetric,
+from llmtune.qa.qa_tests import (
+    JSONValidityTest,
 )


 @pytest.mark.parametrize(
-    "test_class,expected_type",
+    "test_class",
     [
-        (LengthMetric, int),
-        (JaccardSimilarityMetric, float),
-        (DotProductSimilarityMetric, float),
-        (RougeScoreMetric, float),
-        (WordOverlapMetric, float),
-        (VerbPercentMetric, float),
-        (AdjectivePercentMetric, float),
-        (NounPercentMetric, float),
-        (JSONValidityMetric, float),
+        JSONValidityTest,
     ],
 )
-def test_metric_return_type(test_class, expected_type):
+def test_test_return_bool(test_class):
+    """Test to ensure that all tests return pass/fail boolean value."""
     test_instance = test_class()
     prompt = "This is a test prompt."
     ground_truth = "This is a ground truth sentence."
     model_prediction = "This is a model predicted sentence."

-    # Depending on the test class, the output could be different.
-    metric_result = test_instance.get_metric(prompt, ground_truth, model_prediction)
-    assert isinstance(
-        metric_result, expected_type
-    ), f"Expected return type {expected_type}, but got {type(metric_result)}."
-
-
-def test_length_metric():
-    test = LengthMetric()
-    result = test.get_metric("prompt", "short text", "longer text")
-    assert result == 1, "Length difference should be 1."
-
-
-def test_jaccard_similarity_metric():
-    test = JaccardSimilarityMetric()
-    result = test.get_metric("prompt", "hello world", "world hello")
-    assert result == 1.0, "Jaccard similarity should be 1.0 for the same words in different orders."
-
-
-def test_dot_product_similarity_metric():
-    test = DotProductSimilarityMetric()
-    result = test.get_metric("prompt", "data", "data")
-    assert result >= 0, "Dot product similarity should be non-negative."
-
-
-def test_rouge_score_metric():
-    test = RougeScoreMetric()
-    result = test.get_metric("prompt", "the quick brown fox", "the quick brown fox jumps over the lazy dog")
-    assert result >= 0, "ROUGE precision should be non-negative."
-
-
-def test_word_overlap_metric():
-    test = WordOverlapMetric()
-    result = test.get_metric("prompt", "jump over the moon", "jump around the sun")
-    assert result >= 0, "Word overlap percentage should be non-negative."
-
-
-def test_verb_percent_metric():
-    test = VerbPercentMetric()
-    result = test.get_metric("prompt", "He eats", "He is eating")
-    assert result >= 0, "Verb percentage should be non-negative."
-
-
-def test_adjective_percent_metric():
-    test = AdjectivePercentMetric()
-    result = test.get_metric("prompt", "It is beautiful", "It is extremely beautiful")
-    assert result >= 0, "Adjective percentage should be non-negative."
-
-
-def test_noun_percent_metric():
-    test = NounPercentMetric()
-    result = test.get_metric("prompt", "The cat", "The cat and the dog")
-    assert result >= 0, "Noun percentage should be non-negative."
+    metric_result = test_instance.test(prompt, ground_truth, model_prediction)
+    assert isinstance(metric_result, bool), f"Expected return type bool, but got {type(metric_result)}."


 @pytest.mark.parametrize(
     "input_string,expected_value",
     [
-        ('{"Answer": "The cat"}', 1),
-        ("{'Answer': 'The cat'}", 0),  # Double quotes are required in json
-        ('{"Answer": "The cat",}', 0),
-        ('{"Answer": "The cat", "test": "case"}', 1),
-        ('```json\n{"Answer": "The cat"}\n```', 1),  # this json block can still be processed
-        ('Here is an example of a JSON block: {"Answer": "The cat"}', 0),
+        ('{"Answer": "The cat"}', True),
+        ("{'Answer': 'The cat'}", False),  # Double quotes are required in json
+        ('{"Answer": "The cat",}', False),  # Trailing comma is not allowed
+        ('{"Answer": "The cat", "test": "case"}', True),
+        ('```json\n{"Answer": "The cat"}\n```', True),  # this json block can still be processed
+        ('Here is an example of a JSON block: {"Answer": "The cat"}', False),
     ],
 )
-def test_json_valid_metric(input_string: str, expected_value: float):
-    test = JSONValidityMetric()
-    result = test.get_metric("prompt", "The cat", input_string)
+def test_json_valid_metric(input_string: str, expected_value: bool):
+    test = JSONValidityTest()
+    result = test.test("prompt", "The cat", input_string)
     assert result == expected_value, f"JSON validity should be {expected_value} but got {result}."
```
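One subtlety in the rewritten expectations: bool is a subclass of int in Python, so True == 1 and False == 0, and the old integer expected values would still have compared equal. The switch to True/False changes nothing behaviorally; it simply documents the boolean contract:

```python
# bool subclasses int, so these equalities hold; True/False simply
# matches JSONValidityTest's declared bool return type.
assert True == 1 and False == 0
assert isinstance(True, int)
```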

tests/qa/test_qa_tests.py

Lines changed: 105 additions & 0 deletions
```diff
@@ -0,0 +1,105 @@
+import pytest
+
+from llmtune.qa.qa_metrics import (
+    AdjectivePercentMetric,
+    DotProductSimilarityMetric,
+    JaccardSimilarityMetric,
+    JSONValidityMetric,
+    LengthMetric,
+    NounPercentMetric,
+    RougeScoreMetric,
+    VerbPercentMetric,
+    WordOverlapMetric,
+)
+
+
+@pytest.mark.parametrize(
+    "test_class,expected_type",
+    [
+        (LengthMetric, int),
+        (JaccardSimilarityMetric, float),
+        (DotProductSimilarityMetric, float),
+        (RougeScoreMetric, float),
+        (WordOverlapMetric, float),
+        (VerbPercentMetric, float),
+        (AdjectivePercentMetric, float),
+        (NounPercentMetric, float),
+        (JSONValidityMetric, float),
+    ],
+)
+def test_metric_return_type(test_class, expected_type):
+    test_instance = test_class()
+    prompt = "This is a test prompt."
+    ground_truth = "This is a ground truth sentence."
+    model_prediction = "This is a model predicted sentence."
+
+    # Depending on the test class, the output could be different.
+    metric_result = test_instance.get_metric(prompt, ground_truth, model_prediction)
+    assert isinstance(
+        metric_result, expected_type
+    ), f"Expected return type {expected_type}, but got {type(metric_result)}."
+
+
+def test_length_metric():
+    test = LengthMetric()
+    result = test.get_metric("prompt", "short text", "longer text")
+    assert result == 1, "Length difference should be 1."
+
+
+def test_jaccard_similarity_metric():
+    test = JaccardSimilarityMetric()
+    result = test.get_metric("prompt", "hello world", "world hello")
+    assert result == 1.0, "Jaccard similarity should be 1.0 for the same words in different orders."
+
+
+def test_dot_product_similarity_metric():
+    test = DotProductSimilarityMetric()
+    result = test.get_metric("prompt", "data", "data")
+    assert result >= 0, "Dot product similarity should be non-negative."
+
+
+def test_rouge_score_metric():
+    test = RougeScoreMetric()
+    result = test.get_metric("prompt", "the quick brown fox", "the quick brown fox jumps over the lazy dog")
+    assert result >= 0, "ROUGE precision should be non-negative."
+
+
+def test_word_overlap_metric():
+    test = WordOverlapMetric()
+    result = test.get_metric("prompt", "jump over the moon", "jump around the sun")
+    assert result >= 0, "Word overlap percentage should be non-negative."
+
+
+def test_verb_percent_metric():
+    test = VerbPercentMetric()
+    result = test.get_metric("prompt", "He eats", "He is eating")
+    assert result >= 0, "Verb percentage should be non-negative."
+
+
+def test_adjective_percent_metric():
+    test = AdjectivePercentMetric()
+    result = test.get_metric("prompt", "It is beautiful", "It is extremely beautiful")
+    assert result >= 0, "Adjective percentage should be non-negative."
+
+
+def test_noun_percent_metric():
+    test = NounPercentMetric()
+    result = test.get_metric("prompt", "The cat", "The cat and the dog")
+    assert result >= 0, "Noun percentage should be non-negative."
+
+
+@pytest.mark.parametrize(
+    "input_string,expected_value",
+    [
+        ('{"Answer": "The cat"}', 1),
+        ("{'Answer': 'The cat'}", 0),  # Double quotes are required in json
+        ('{"Answer": "The cat",}', 0),
+        ('{"Answer": "The cat", "test": "case"}', 1),
+        ('```json\n{"Answer": "The cat"}\n```', 1),  # this json block can still be processed
+        ('Here is an example of a JSON block: {"Answer": "The cat"}', 0),
+    ],
+)
+def test_json_valid_metric(input_string: str, expected_value: float):
+    test = JSONValidityMetric()
+    result = test.get_metric("prompt", "The cat", input_string)
+    assert result == expected_value, f"JSON validity should be {expected_value} but got {result}."
```
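For context, the JsonValidityEvaluator these tests exercise indirectly comes from langchain; as the diffs above show, its evaluate_strings result carries a "score" entry (1 for valid JSON, 0 otherwise) that JSONValidityTest coerces to a bool. A minimal sketch of that raw behavior:

```python
from langchain.evaluation import JsonValidityEvaluator

evaluator = JsonValidityEvaluator()
result = evaluator.evaluate_strings(prediction='{"Answer": "The cat"}')

# The "score" entry is 1 for parseable JSON, 0 otherwise (see qa_tests.py).
assert bool(result["score"]) is True
```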
