Commit 65809e5

Merge pull request #783 from parea-ai/chore-update-evals-output-is-list
chore: update evals list parsing
2 parents 3c219ce + a015af7 commit 65809e5

6 files changed: +27 -35 lines

parea/evals/rag/context_query_relevancy.py

Lines changed: 2 additions & 8 deletions
@@ -1,6 +1,6 @@
 from typing import Callable, List, Optional

-from parea.evals.utils import call_openai, sent_tokenize
+from parea.evals.utils import call_openai, get_context, sent_tokenize
 from parea.schemas.log import Log


@@ -27,13 +27,7 @@ def context_query_relevancy_factory(
     def context_query_relevancy(log: Log) -> float:
         """Quantifies how much the retrieved context relates to the query."""
         question = log.inputs[question_field]
-        if context_fields:
-            context = "\n".join(log.inputs[context_field] for context_field in context_fields)
-        else:
-            if isinstance(log.output, list):
-                context = "\n".join(log.output)
-            else:
-                context = str(log.output)
+        context = get_context(log, context_fields)

         extracted_sentences = call_openai(
             model=model,
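
For reference, a hedged sketch of the call the eval now makes: with explicit context_fields, get_context joins the named inputs with newlines, matching the old branch. Constructing Log directly with only inputs is an illustrative assumption and may need more fields in practice:

    # Illustrative only, not part of the commit.
    from parea.evals.utils import get_context
    from parea.schemas.log import Log

    log = Log(inputs={"question": "What is RAG?", "context": "Retrieval-augmented generation combines search with LLMs."})
    context = get_context(log, ["context"])
    # -> "Retrieval-augmented generation combines search with LLMs."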

parea/evals/rag/context_ranking_listwise.py

Lines changed: 2 additions & 8 deletions
@@ -1,6 +1,6 @@
 from typing import Callable, List, Optional

-from parea.evals.utils import call_openai, ndcg
+from parea.evals.utils import call_openai, get_context, ndcg
 from parea.schemas.log import Log


@@ -99,13 +99,7 @@ def progressive_reranking(query: str, contexts: List[str]) -> List[int]:
     def context_ranking(log: Log) -> float:
         """Quantifies if the retrieved context is ranked by their relevancy by re-ranking the contexts."""
         question = log.inputs[question_field]
-        if context_fields:
-            contexts = [log.inputs[context_field] for context_field in context_fields]
-        else:
-            if isinstance(log.output, list):
-                contexts = log.output
-            else:
-                contexts = [str(log.output)]
+        contexts = get_context(log, context_fields, True)

         reranked_indices = progressive_reranking(question, contexts)

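Both ranking evals pass True as the third argument (as_list), so get_context returns the contexts as a list instead of one joined string, which is what progressive_reranking expects. A hedged sketch; the Log construction and field names are illustrative:

    # Illustrative only, not part of the commit.
    from parea.evals.utils import get_context
    from parea.schemas.log import Log

    log = Log(inputs={"question": "q", "doc_1": "first chunk", "doc_2": "second chunk"})
    contexts = get_context(log, ["doc_1", "doc_2"], True)
    # -> ["first chunk", "second chunk"]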

parea/evals/rag/context_ranking_pointwise.py

Lines changed: 2 additions & 8 deletions
@@ -1,6 +1,6 @@
 from typing import Callable, List, Optional

-from parea.evals.utils import call_openai, safe_json_loads
+from parea.evals.utils import call_openai, get_context, safe_json_loads
 from parea.schemas.log import Log


@@ -40,13 +40,7 @@ def context_ranking_pointwise_factory(
     def context_ranking_pointwise(log: Log) -> float:
         """Quantifies if the retrieved context is ranked by their relevancy"""
         question = log.inputs[question_field]
-        if context_fields:
-            contexts = [log.inputs[context_field] for context_field in context_fields]
-        else:
-            if isinstance(log.output, list):
-                contexts = log.output
-            else:
-                contexts = [str(log.output)]
+        contexts = get_context(log, context_fields, True)

         verifications = []
         for context in contexts:

parea/evals/rag/percent_target_supported_by_context.py

Lines changed: 3 additions & 8 deletions
@@ -2,7 +2,7 @@

 import re

-from parea.evals.utils import call_openai
+from parea.evals.utils import call_openai, get_context
 from parea.schemas.log import Log


@@ -14,13 +14,8 @@ def percent_target_supported_by_context_factory(
     def percent_target_supported_by_context(log: Log) -> Union[float, None]:
         """Quantifies how many sentences in the target/ground truth are supported by the retrieved context."""
         question = log.inputs[question_field]
-        if context_fields:
-            context = "\n".join(log.inputs[context_field] for context_field in context_fields)
-        else:
-            if isinstance(log.output, list):
-                context = "\n".join(log.output)
-            else:
-                context = str(log.output)
+        context = get_context(log, context_fields)
+
         if (target := log.target) is None:
             return None


parea/evals/utils.py

Lines changed: 17 additions & 2 deletions
@@ -1,4 +1,4 @@
-from typing import Callable, List, Union
+from typing import Callable, List, Optional, Union

 import json
 import warnings
@@ -10,7 +10,7 @@
 from openai import __version__ as openai_version

 from parea.parea_logger import parea_logger
-from parea.schemas import EvaluationResult
+from parea.schemas import EvaluationResult, Log
 from parea.schemas.log import Log
 from parea.schemas.models import UpdateLog

@@ -179,3 +179,18 @@ def get_tokens(model: str, text: str) -> List[int]:
     except Exception as e:
         print(f"Error encoding text: {e}")
         return []
+
+
+def get_context(log: Log, context_fields: Optional[List[str]] = None, as_list: bool = False) -> str:
+    if context_fields:
+        context_list = [log.inputs[context_field] for context_field in context_fields]
+        return context_list if as_list else "\n".join(context_list)
+    else:
+        context = log.output
+        try:
+            loaded_context = json.loads(log.output)
+            if isinstance(log.output, list):
+                return loaded_context if as_list else "\n".join(loaded_context)
+        except json.JSONDecodeError:
+            pass
+        return [context] if as_list else context
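
A hedged note on the fallback path as written above: with no context_fields, the helper tries json.loads on log.output; if that raises JSONDecodeError (plain-text output), the raw output is returned, wrapped in a list when as_list is True. The values below are illustrative:

    # Illustrative only, not part of the commit.
    from parea.evals.utils import get_context
    from parea.schemas.log import Log

    log = Log(inputs={"question": "q"}, output="a single retrieved passage")
    get_context(log)                # -> "a single retrieved passage"
    get_context(log, as_list=True)  # -> ["a single retrieved passage"]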

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@ build-backend = "poetry.core.masonry.api"
 [tool.poetry]
 name = "parea-ai"
 packages = [{ include = "parea" }]
-version = "0.2.135"
+version = "0.2.136"
 description = "Parea python sdk"
 readme = "README.md"
 authors = ["joel-parea-ai <[email protected]>"]
