@@ -1,39 +1,105 @@
 import pytest

-from llmtune.qa.qa_tests import (
-    JSONValidityTest,
+from llmtune.qa.qa_metrics import (
+    AdjectivePercentMetric,
+    DotProductSimilarityMetric,
+    JaccardSimilarityMetric,
+    JSONValidityMetric,
+    LengthMetric,
+    NounPercentMetric,
+    RougeScoreMetric,
+    VerbPercentMetric,
+    WordOverlapMetric,
 )


 @pytest.mark.parametrize(
-    "test_class",
+    "test_class,expected_type",
     [
-        JSONValidityTest,
+        (LengthMetric, int),
+        (JaccardSimilarityMetric, float),
+        (DotProductSimilarityMetric, float),
+        (RougeScoreMetric, float),
+        (WordOverlapMetric, float),
+        (VerbPercentMetric, float),
+        (AdjectivePercentMetric, float),
+        (NounPercentMetric, float),
+        (JSONValidityMetric, float),
     ],
 )
-def test_test_return_bool(test_class):
-    """Test to ensure that all tests return pass/fail boolean value."""
+def test_metric_return_type(test_class, expected_type):
     test_instance = test_class()
     prompt = "This is a test prompt."
     ground_truth = "This is a ground truth sentence."
     model_prediction = "This is a model predicted sentence."

-    metric_result = test_instance.test(prompt, ground_truth, model_prediction)
-    assert isinstance(metric_result, bool), f"Expected return type bool, but got {type(metric_result)}."
+    # The expected return type depends on the metric class.
+    metric_result = test_instance.get_metric(prompt, ground_truth, model_prediction)
+    assert isinstance(
+        metric_result, expected_type
+    ), f"Expected return type {expected_type}, but got {type(metric_result)}."
+
+
+def test_length_metric():
+    test = LengthMetric()
+    result = test.get_metric("prompt", "short text", "longer text")
+    assert result == 1, "Length difference should be 1."
+
+
+def test_jaccard_similarity_metric():
+    test = JaccardSimilarityMetric()
+    result = test.get_metric("prompt", "hello world", "world hello")
+    assert result == 1.0, "Jaccard similarity should be 1.0 for the same words in different orders."
+
+
+def test_dot_product_similarity_metric():
+    test = DotProductSimilarityMetric()
+    result = test.get_metric("prompt", "data", "data")
+    assert result >= 0, "Dot product similarity should be non-negative."
+
+
+def test_rouge_score_metric():
+    test = RougeScoreMetric()
+    result = test.get_metric("prompt", "the quick brown fox", "the quick brown fox jumps over the lazy dog")
+    assert result >= 0, "ROUGE precision should be non-negative."
+
+
+def test_word_overlap_metric():
+    test = WordOverlapMetric()
+    result = test.get_metric("prompt", "jump over the moon", "jump around the sun")
+    assert result >= 0, "Word overlap percentage should be non-negative."
+
+
+def test_verb_percent_metric():
+    test = VerbPercentMetric()
+    result = test.get_metric("prompt", "He eats", "He is eating")
+    assert result >= 0, "Verb percentage should be non-negative."
+
+
+def test_adjective_percent_metric():
+    test = AdjectivePercentMetric()
+    result = test.get_metric("prompt", "It is beautiful", "It is extremely beautiful")
+    assert result >= 0, "Adjective percentage should be non-negative."
+
+
+def test_noun_percent_metric():
+    test = NounPercentMetric()
+    result = test.get_metric("prompt", "The cat", "The cat and the dog")
+    assert result >= 0, "Noun percentage should be non-negative."


 @pytest.mark.parametrize(
     "input_string,expected_value",
     [
-        ('{"Answer": "The cat"}', True),
-        ("{'Answer': 'The cat'}", False),  # Double quotes are required in json
-        ('{"Answer": "The cat",}', False),  # Trailing comma is not allowed
-        ('{"Answer": "The cat", "test": "case"}', True),
-        ('```json\n{"Answer": "The cat"}\n```', True),  # this json block can still be processed
-        ('Here is an example of a JSON block: {"Answer": "The cat"}', False),
+        ('{"Answer": "The cat"}', 1),
+        ("{'Answer': 'The cat'}", 0),  # Double quotes are required in JSON
+        ('{"Answer": "The cat",}', 0),  # Trailing comma is not allowed
+        ('{"Answer": "The cat", "test": "case"}', 1),
+        ('```json\n{"Answer": "The cat"}\n```', 1),  # A fenced JSON block can still be parsed
+        ('Here is an example of a JSON block: {"Answer": "The cat"}', 0),
     ],
 )
-def test_json_valid_metric(input_string: str, expected_value: bool):
-    test = JSONValidityTest()
-    result = test.test("prompt", "The cat", input_string)
+def test_json_valid_metric(input_string: str, expected_value: float):
+    test = JSONValidityMetric()
+    result = test.get_metric("prompt", "The cat", input_string)
     assert result == expected_value, f"JSON validity should be {expected_value} but got {result}."
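
For context on the interface these tests exercise: each metric class is constructed with no arguments and exposes get_metric(prompt, ground_truth, model_prediction), returning a numeric score rather than the old pass/fail boolean. Below is a minimal sketch of what llmtune.qa.qa_metrics could look like; the base-class name LLMQaMetric is a guess, and both implementations are inferred solely from the expectations in test_length_metric and the JSON parametrize cases, not from the library's actual source.

import json
import re
from abc import ABC, abstractmethod
from typing import Union


class LLMQaMetric(ABC):
    """Hypothetical base interface implied by the tests above."""

    @abstractmethod
    def get_metric(self, prompt: str, ground_truth: str, model_prediction: str) -> Union[int, float]:
        """Return a numeric score comparing the prediction to the ground truth."""


class LengthMetric(LLMQaMetric):
    def get_metric(self, prompt: str, ground_truth: str, model_prediction: str) -> int:
        # Inferred from test_length_metric: "short text" (10 chars) vs.
        # "longer text" (11 chars) must yield 1, i.e. an absolute
        # character-length difference.
        return abs(len(ground_truth) - len(model_prediction))


class JSONValidityMetric(LLMQaMetric):
    def get_metric(self, prompt: str, ground_truth: str, model_prediction: str) -> float:
        # Inferred from the parametrized cases: a ```json ... ``` fence is
        # stripped before parsing, but JSON embedded in surrounding prose
        # still counts as invalid.
        text = model_prediction.strip()
        fenced = re.match(r"^```json\n(.*)\n```$", text, re.DOTALL)
        if fenced:
            text = fenced.group(1)
        try:
            json.loads(text)
            return 1.0
        except json.JSONDecodeError:
            return 0.0

With this shape, adding a new metric is just a matter of subclassing and implementing get_metric, and the parametrized test_metric_return_type check above pins down each metric's return type.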