Fix QA test errors: results cache, QaTestRegistry rename, RichUI call

benjaminye · benjaminye · commit bfc52d165bd0 · 2024-04-30T12:14:12.000-04:00
diff --git a/llmtune/qa/generics.py b/llmtune/qa/generics.py
@@ -48,7 +48,7 @@ def __init__(
         self.ground_truths = ground_truths
         self.model_preds = model_preds
 
-        self.test_results = {}
+        self._results = {}
 
     @staticmethod
     def from_csv(file_path: str, tests: List[LLMQaTest]) -> "LLMTestSuite":
@@ -60,29 +60,29 @@ def from_csv(file_path: str, tests: List[LLMQaTest]) -> "LLMTestSuite":
 
     def run_tests(self) -> Dict[str, List[Union[float, int, bool]]]:
         test_results = {}
-        for test in zip(self.tests):
+        for test in self.tests:
             metrics = []
             for prompt, ground_truth, model_pred in zip(self.prompts, self.ground_truths, self.model_preds):
                 metrics.append(test.get_metric(prompt, ground_truth, model_pred))
             test_results[test.test_name] = metrics
 
-        self.test_results = test_results
+        self._results = test_results
         return test_results
 
     @property
     def test_results(self):
-        return self.test_results if self.test_results else self.run_tests()
+        return self._results if self._results else self.run_tests()
 
     def print_test_results(self):
-        result_dictionary = self.test_results()
+        result_dictionary = self.test_results
         column_data = {key: list(result_dictionary[key]) for key in result_dictionary}
         mean_values = {key: statistics.mean(column_data[key]) for key in column_data}
         median_values = {key: statistics.median(column_data[key]) for key in column_data}
         stdev_values = {key: statistics.stdev(column_data[key]) for key in column_data}
         # Use the RichUI class to display the table
-        RichUI.display_table(result_dictionary, mean_values, median_values, stdev_values)
+        RichUI.qa_display_table(result_dictionary, mean_values, median_values, stdev_values)
 
     def save_test_results(self, path: str):
         # TODO: save these!
-        resultant_dataframe = pd.DataFrame(self.test_results())
+        resultant_dataframe = pd.DataFrame(self.test_results)
         resultant_dataframe.to_csv(path, index=False)
diff --git a/llmtune/qa/qa_tests.py b/llmtune/qa/qa_tests.py
@@ -9,7 +9,7 @@
 from rouge_score import rouge_scorer
 from transformers import DistilBertModel, DistilBertTokenizer
 
-from llmtune.qa.generics import LLMQaTest, TestRegistry
+from llmtune.qa.generics import LLMQaTest, QaTestRegistry
 
 
 model_name = "distilbert-base-uncased"
@@ -21,7 +21,7 @@
 nltk.download("averaged_perceptron_tagger")
 
 
-@TestRegistry.register("summary_length")
+@QaTestRegistry.register("summary_length")
 class LengthTest(LLMQaTest):
     @property
     def test_name(self) -> str:
@@ -31,7 +31,7 @@ def get_metric(self, prompt: str, ground_truth: str, model_prediction: str) -> U
         return abs(len(ground_truth) - len(model_prediction))
 
 
-@TestRegistry.register("jaccard_similarity")
+@QaTestRegistry.register("jaccard_similarity")
 class JaccardSimilarityTest(LLMQaTest):
     @property
     def test_name(self) -> str:
@@ -48,7 +48,7 @@ def get_metric(self, prompt: str, ground_truth: str, model_prediction: str) -> U
         return similarity
 
 
-@TestRegistry.register("dot_product")
+@QaTestRegistry.register("dot_product")
 class DotProductSimilarityTest(LLMQaTest):
     @property
     def test_name(self) -> str:
@@ -67,7 +67,7 @@ def get_metric(self, prompt: str, ground_truth: str, model_prediction: str) -> U
         return dot_product_similarity
 
 
-@TestRegistry.register("rouge_score")
+@QaTestRegistry.register("rouge_score")
 class RougeScoreTest(LLMQaTest):
     @property
     def test_name(self) -> str:
@@ -79,7 +79,7 @@ def get_metric(self, prompt: str, ground_truth: str, model_prediction: str) -> U
         return float(scores["rouge1"].precision)
 
 
-@TestRegistry.register("word_overlap")
+@QaTestRegistry.register("word_overlap")
 class WordOverlapTest(LLMQaTest):
     @property
     def test_name(self) -> str:
@@ -103,6 +103,7 @@ def get_metric(self, prompt: str, ground_truth: str, model_prediction: str) -> U
         return overlap_percentage
 
 
+@QaTestRegistry.register("verb_percent")
 class PosCompositionTest(LLMQaTest):
     def _get_pos_percent(self, text: str, pos_tags: List[str]) -> float:
         words = word_tokenize(text)
@@ -112,7 +113,7 @@ def _get_pos_percent(self, text: str, pos_tags: List[str]) -> float:
         return round(len(pos_words) / total_words, 2)
 
 
-@TestRegistry.register("verb_percent")
+@QaTestRegistry.register("verb_percent")
 class VerbPercent(PosCompositionTest):
     @property
     def test_name(self) -> str:
@@ -122,7 +123,7 @@ def get_metric(self, prompt: str, ground_truth: str, model_prediction: str) -> f
         return self._get_pos_percent(model_prediction, ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"])
 
 
-@TestRegistry.register("adjective_percent")
+@QaTestRegistry.register("adjective_percent")
 class AdjectivePercent(PosCompositionTest):
     @property
     def test_name(self) -> str:
@@ -132,7 +133,7 @@ def get_metric(self, prompt: str, ground_truth: str, model_prediction: str) -> f
         return self._get_pos_percent(model_prediction, ["JJ", "JJR", "JJS"])
 
 
-@TestRegistry.register("noun_percent")
+@QaTestRegistry.register("noun_percent")
 class NounPercent(PosCompositionTest):
     @property
     def test_name(self) -> str:
diff --git a/llmtune/ui/rich_ui.py b/llmtune/ui/rich_ui.py
@@ -182,7 +182,7 @@ def qa_found():
         pass
 
     @staticmethod
-    def qa_display_table(self, result_dictionary, mean_values, median_values, stdev_values):
+    def qa_display_table(result_dictionary, mean_values, median_values, stdev_values):
         # Create a table
         table = Table(show_header=True, header_style="bold", title="Test Results")