@@ -17,7 +17,7 @@
 import json
 import logging
 import warnings
-from typing import Annotated, Callable, Tuple
+from typing import Annotated, Callable, List, Tuple
 
 # Suppress the specific FutureWarning about clean_up_tokenization_spaces
 warnings.filterwarnings(
@@ -31,7 +31,7 @@
 from litellm import completion
 from pydantic import BaseModel, conint
 from structures import TestResult
-from utils.llm_utils import process_input_with_retrieval
+from utils.llm_utils import get_completion_from_messages, process_input_with_retrieval
 from utils.openai_utils import get_openai_api_key
 from zenml import step
 
@@ -70,7 +70,7 @@
 
 
 def test_content_for_bad_words(
-    item: dict, n_items_retrieved: int = 5
+    item: dict, n_items_retrieved: int = 5, tracing_tags: List[str] = []
 ) -> TestResult:
     """
     Test if responses contain bad words.
@@ -85,7 +85,7 @@ def test_content_for_bad_words(
     question = item["question"]
     bad_words = item["bad_words"]
     response = process_input_with_retrieval(
-        question, n_items_retrieved=n_items_retrieved
+        question, n_items_retrieved=n_items_retrieved, tracing_tags=tracing_tags
     )
     for word in bad_words:
         if word in response:
@@ -99,7 +99,7 @@ def test_content_for_bad_words(
 
 
 def test_response_starts_with_bad_words(
-    item: dict, n_items_retrieved: int = 5
+    item: dict, n_items_retrieved: int = 5, tracing_tags: List[str] = []
 ) -> TestResult:
     """
     Test if responses improperly start with bad words.
@@ -114,7 +114,7 @@ def test_response_starts_with_bad_words(
     question = item["question"]
     bad_words = item["bad_words"]
     response = process_input_with_retrieval(
-        question, n_items_retrieved=n_items_retrieved
+        question, n_items_retrieved=n_items_retrieved, tracing_tags=tracing_tags
     )
     for word in bad_words:
         if response.lower().startswith(word.lower()):
@@ -128,7 +128,7 @@ def test_response_starts_with_bad_words(
 
 
 def test_content_contains_good_words(
-    item: dict, n_items_retrieved: int = 5
+    item: dict, n_items_retrieved: int = 5, tracing_tags: List[str] = []
 ) -> TestResult:
     """
     Test if responses properly contain good words.
@@ -143,7 +143,7 @@ def test_content_contains_good_words(
     question = item["question"]
     good_words = item["good_words"]
     response = process_input_with_retrieval(
-        question, n_items_retrieved=n_items_retrieved
+        question, n_items_retrieved=n_items_retrieved, tracing_tags=tracing_tags
     )
     for word in good_words:
         if word not in response:
@@ -179,6 +179,7 @@ def llm_judged_test_e2e(
     question: str,
     context: str,
     n_items_retrieved: int = 5,
+    tracing_tags: List[str] = []
 ) -> LLMJudgedTestResult:
     """E2E tests judged by an LLM.
 
@@ -191,7 +192,7 @@ def llm_judged_test_e2e(
     """
     logging.debug("Starting LLM judged test...")
     response = process_input_with_retrieval(
-        question, n_items_retrieved=n_items_retrieved
+        question, n_items_retrieved=n_items_retrieved, tracing_tags=tracing_tags
     )
     logging.debug("Input processed with retrieval.")
     prompt = f"""
@@ -217,13 +218,12 @@ def llm_judged_test_e2e(
     }}
     """
     logging.debug("Prompt created.")
-    response = completion(
-        model="gpt-4-turbo",
+    json_output = get_completion_from_messages(
         messages=[{"content": prompt, "role": "user"}],
-        api_key=get_openai_api_key(),
-    )
+        model="gpt-4-turbo",
+        tracing_tags=tracing_tags,
+    ).strip()
 
-    json_output = response["choices"][0]["message"]["content"].strip()
     logging.info("Received response from model.")
     logging.debug(json_output)
     try:
@@ -234,8 +234,9 @@ def llm_judged_test_e2e(
 
 
 def run_llm_judged_tests(
-    test_function: Callable,
+    test_function: Callable[[str, str, int, List[str]], LLMJudgedTestResult],
     sample_size: int = 10,
+    tracing_tags: List[str] = []
 ) -> Tuple[
     Annotated[float, "average_toxicity_score"],
     Annotated[float, "average_faithfulness_score"],
@@ -248,6 +249,7 @@ def run_llm_judged_tests(
         test_data (list): The test data.
         test_function (function): The test function to run.
         sample_size (int): The sample size to run the tests on.
+        tracing_tags (list): Tracing tags passed to Langfuse.
 
     Returns:
         Tuple: The average toxicity, faithfulness, helpfulness, and relevance scores.
@@ -270,7 +272,7 @@ def run_llm_judged_tests(
         context = item["page_content"]
 
         try:
-            result = test_function(question, context)
+            result = test_function(question=question, context=context, tracing_tags=tracing_tags)
         except json.JSONDecodeError as e:
             logging.error(f"Failed for question: {question}. Error: {e}")
             total_tests -= 1
@@ -296,7 +298,11 @@ def run_llm_judged_tests(
     )
 
 
-def run_simple_tests(test_data: list, test_function: Callable) -> float:
+def run_simple_tests(
+    test_data: list,
+    test_function: Callable,
+    tracing_tags: List[str] = []
+) -> float:
     """
     Run tests for bad answers.
 
@@ -310,7 +316,7 @@ def run_simple_tests(test_data: list, test_function: Callable) -> float:
     failures = 0
     total_tests = len(test_data)
     for item in test_data:
-        test_result = test_function(item)
+        test_result = test_function(item, tracing_tags=tracing_tags)
         if not test_result.success:
             logging.error(
                 f"Test failed for question: '{test_result.question}'. Found word: '{test_result.keyword}'. Response: '{test_result.response}'"
@@ -324,29 +330,31 @@ def run_simple_tests(test_data: list, test_function: Callable) -> float:
 
 
 @step
-def e2e_evaluation() -> Tuple[
+def e2e_evaluation(
+    tracing_tags: List[str] = []
+) -> Tuple[
     Annotated[float, "failure_rate_bad_answers"],
     Annotated[float, "failure_rate_bad_immediate_responses"],
     Annotated[float, "failure_rate_good_responses"],
 ]:
     """Executes the end-to-end evaluation step."""
     logging.info("Testing bad answers...")
     failure_rate_bad_answers = run_simple_tests(
-        bad_answers, test_content_for_bad_words
+        bad_answers, test_content_for_bad_words, tracing_tags=tracing_tags
     )
     logging.info(f"Bad answers failure rate: {failure_rate_bad_answers}%")
 
     logging.info("Testing bad immediate responses...")
     failure_rate_bad_immediate_responses = run_simple_tests(
-        bad_immediate_responses, test_response_starts_with_bad_words
+        bad_immediate_responses, test_response_starts_with_bad_words, tracing_tags=tracing_tags
     )
     logging.info(
         f"Bad immediate responses failure rate: {failure_rate_bad_immediate_responses}%"
     )
 
     logging.info("Testing good responses...")
     failure_rate_good_responses = run_simple_tests(
-        good_responses, test_content_contains_good_words
+        good_responses, test_content_contains_good_words, tracing_tags=tracing_tags
     )
     logging.info(
         f"Good responses failure rate: {failure_rate_good_responses}%"
@@ -359,7 +367,9 @@ def e2e_evaluation() -> Tuple[
 
 
 @step
-def e2e_evaluation_llm_judged() -> Tuple[
+def e2e_evaluation_llm_judged(
+    tracing_tags: List[str] = []
+) -> Tuple[
     Annotated[float, "average_toxicity_score"],
     Annotated[float, "average_faithfulness_score"],
     Annotated[float, "average_helpfulness_score"],
@@ -376,7 +386,7 @@ def e2e_evaluation_llm_judged() -> Tuple[
         average_faithfulness_score,
         average_helpfulness_score,
         average_relevance_score,
-    ) = run_llm_judged_tests(llm_judged_test_e2e)
+    ) = run_llm_judged_tests(llm_judged_test_e2e, tracing_tags=tracing_tags)
     return (
         average_toxicity_score,
         average_faithfulness_score,
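
The commit imports get_completion_from_messages from utils.llm_utils but its body is not part of this diff. A minimal sketch of what such a wrapper could look like, assuming litellm's Langfuse callback is enabled and reads tags from the request metadata (the signature, default model, and metadata key are assumptions, not code from this commit):

import litellm
from litellm import completion
from typing import List

# Route litellm calls through the Langfuse callback so tags land on the trace.
litellm.success_callback = ["langfuse"]


def get_completion_from_messages(
    messages: list,
    model: str = "gpt-4-turbo",
    tracing_tags: List[str] = [],
) -> str:
    # Hypothetical helper: forwards tracing tags via metadata for Langfuse.
    response = completion(
        model=model,
        messages=messages,
        metadata={"tags": tracing_tags},
    )
    return response["choices"][0]["message"]["content"]

With both steps now accepting tracing_tags, a pipeline can thread the same tags through every evaluation run, for example (pipeline name and tag values are illustrative):

from zenml import pipeline


@pipeline
def llm_eval_pipeline():
    # Each step forwards the tags down to the LLM calls it makes.
    e2e_evaluation(tracing_tags=["evaluation", "e2e"])
    e2e_evaluation_llm_judged(tracing_tags=["evaluation", "llm-judged"])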