2 changes: 1 addition & 1 deletion tests/metrics/__init__.py
@@ -2,7 +2,7 @@
 from tests.metrics.registry import MetricRegistry
 from tests.metrics.scorer import SimilarityScorer
 from tests.metrics.semantic import SemanticSimilarityMetric
-from tests.metrics.keyword import KeywordMatchMetric
+from tests.metrics.keyword_match import KeywordMatchMetric
 from tests.metrics.nli import NLIEntailmentMetric

 __all__ = [
tests/metrics/keyword.py → tests/metrics/keyword_match.py: file renamed without changes.
27 changes: 27 additions & 0 deletions tests/metrics/llm_judge.py
@@ -0,0 +1,27 @@
+from google import genai
+from google.genai import types
+import httpx
+
+def get_score(question: str, answer: str):
+    """Ask Gemini to grade the pipeline's answer against the textbook and return its verdict."""
+    client = genai.Client()
+    doc_url = "https://my.uopeople.edu/pluginfile.php/57436/mod_book/chapter/37620/Database%20System%20Concepts%204th%20Edition%20By%20Silberschatz-Korth-Sudarshan.pdf"
+    doc_data = httpx.get(doc_url).content
+
+    # Trailing spaces keep the concatenated prompt segments from running together.
+    prompt = "I am creating an LLM pipeline that answers questions from a textbook. I need your help to evaluate that pipeline. " \
+        "I have attached the textbook. Read the textbook. I will provide you a question and the answer that my LLM generated. " \
+        "You need to evaluate the answer and give me a rating out of 5, 5 being excellent. Also, give me a very brief reasoning for why you " \
+        "gave that rating. Use only the textbook to evaluate the answers that my LLM generated." \
+        "\n\nQuestion: " + question + "\nAnswer: " + answer
+
+    response = client.models.generate_content(
+        model="gemini-2.5-flash",
+        contents=[
+            types.Part.from_bytes(
+                data=doc_data,
+                mime_type='application/pdf',
+            ),
+            prompt])
+    print("***LLM as judge***")
+    print(response.text)
+    return response.text
+
+if __name__ == "__main__":
+    get_score("", "")
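
Since the judge replies in free text (a rating out of 5 plus brief reasoning), a caller that wants a numeric value has to parse it out of the reply. Below is a minimal sketch of such a parser; parse_rating is a hypothetical helper that is not part of this PR, and it assumes the model phrases its rating as "N/5" or "N out of 5".

import re
from typing import Optional

def parse_rating(judge_text: str) -> Optional[int]:
    """Extract the first 'N/5' or 'N out of 5' rating from the judge's free-text reply."""
    match = re.search(r"\b([0-5])\s*(?:/\s*5|out of 5)\b", judge_text)
    return int(match.group(1)) if match else None

# e.g. parse_rating(get_score(question, answer)) -> 4, or None if no rating was found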
3 changes: 3 additions & 0 deletions tests/test_benchmarks.py
@@ -3,6 +3,7 @@
 from pathlib import Path
 from datetime import datetime
 from tests.metrics import SimilarityScorer
+import tests.metrics.llm_judge


 @pytest.mark.filterwarnings("ignore::DeprecationWarning")
@@ -109,6 +110,7 @@ def run_benchmark(benchmark, config, results_dir, scorer):

    # Calculate scores
    try:
+        tests.metrics.llm_judge.get_score(question, retrieved_answer)
        scores = scorer.calculate_scores(retrieved_answer, expected_answer, keywords)
    except Exception as e:
        error_msg = f"Scoring error: {e}"
@@ -180,6 +182,7 @@ def get_tokensmith_answer(question, config, golden_chunks=None):
    args = argparse.Namespace(
        index_prefix=config["index_prefix"],
        model_path=config.get("model_path"),
+        system_prompt_mode=config.get("system_prompt_mode"),
    )

    # Create QueryPlanConfig from our test config
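
The new system_prompt_mode argument is read from the same config dict that already supplies index_prefix and model_path, so a benchmark config can now set it per run. A sketch of what such an entry might look like; aside from the three keys used above, the key names and all of the values are assumptions for illustration only.

# Hypothetical benchmark config entry; only index_prefix, model_path, and
# system_prompt_mode appear in this diff -- everything else is assumed.
config = {
    "index_prefix": "textbook",
    "model_path": "models/local-llm.gguf",
    "system_prompt_mode": "default",  # config.get(...) yields None when this key is omitted
}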