2 changes: 1 addition & 1 deletion tests/metrics/__init__.py
@@ -2,7 +2,7 @@
 from tests.metrics.registry import MetricRegistry
 from tests.metrics.scorer import SimilarityScorer
 from tests.metrics.semantic import SemanticSimilarityMetric
-from tests.metrics.keyword import KeywordMatchMetric
+from tests.metrics.keyword_match import KeywordMatchMetric
 from tests.metrics.nli import NLIEntailmentMetric

 __all__ = [
tests/metrics/keyword.py → tests/metrics/keyword_match.py: file renamed without changes.
27 changes: 27 additions & 0 deletions tests/metrics/llm_judge.py
@@ -0,0 +1,27 @@
+from google import genai
+from google.genai import types
+import httpx
+
+def get_score(question: str, answer: str):
+    """Ask Gemini to grade the pipeline's answer against the textbook and return its verdict."""
+    client = genai.Client()
+    doc_url = "https://my.uopeople.edu/pluginfile.php/57436/mod_book/chapter/37620/Database%20System%20Concepts%204th%20Edition%20By%20Silberschatz-Korth-Sudarshan.pdf"
+    doc_data = httpx.get(doc_url).content
+
+    # Trailing spaces keep the concatenated prompt segments from running together.
+    prompt = "I am creating an LLM pipeline that answers questions from a textbook. I need your help to evaluate that pipeline. " \
+        "I have attached the textbook. Read the textbook. I will provide you a question and the answer that my LLM generated. " \
+        "You need to evaluate the answer and give me a rating out of 5, 5 being excellent. Also, give me a very brief reasoning for why you " \
+        "gave that rating. Use only the textbook to evaluate the answers that my LLM generated." \
+        "\n\nQuestion: " + question + "\nAnswer: " + answer
+
+    response = client.models.generate_content(
+        model="gemini-2.5-flash",
+        contents=[
+            types.Part.from_bytes(
+                data=doc_data,
+                mime_type='application/pdf',
+            ),
+            prompt])
+    print("***LLM as judge***")
+    print(response.text)
+    return response.text
+
+if __name__ == "__main__":
+    get_score("", "")
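
Since the judge replies in free text (a rating out of 5 plus brief reasoning), a caller that wants a numeric value has to parse it out of the reply. Below is a minimal sketch of such a parser; parse_rating is a hypothetical helper that is not part of this PR, and it assumes the model phrases its rating as "N/5" or "N out of 5".

import re
from typing import Optional

def parse_rating(judge_text: str) -> Optional[int]:
    """Extract the first 'N/5' or 'N out of 5' rating from the judge's free-text reply."""
    match = re.search(r"\b([0-5])\s*(?:/\s*5|out of 5)\b", judge_text)
    return int(match.group(1)) if match else None

# e.g. parse_rating(get_score(question, answer)) -> 4, or None if no rating was found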
3 changes: 3 additions & 0 deletions tests/test_benchmarks.py
@@ -3,6 +3,7 @@
 from pathlib import Path
 from datetime import datetime
 from tests.metrics import SimilarityScorer
+import tests.metrics.llm_judge


 @pytest.mark.filterwarnings("ignore::DeprecationWarning")
@@ -109,6 +110,7 @@ def run_benchmark(benchmark, config, results_dir, scorer):

    # Calculate scores
    try:
+        tests.metrics.llm_judge.get_score(question, retrieved_answer)
        scores = scorer.calculate_scores(retrieved_answer, expected_answer, keywords)
    except Exception as e:
        error_msg = f"Scoring error: {e}"
@@ -180,6 +182,7 @@ def get_tokensmith_answer(question, config, golden_chunks=None):
    args = argparse.Namespace(
        index_prefix=config["index_prefix"],
        model_path=config.get("model_path"),
+        system_prompt_mode=config.get("system_prompt_mode"),
    )

    # Create QueryPlanConfig from our test config
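
The new system_prompt_mode argument is read from the same config dict that already supplies index_prefix and model_path, so a benchmark config can now set it per run. A sketch of what such an entry might look like; aside from the three keys used above, the key names and all of the values are assumptions for illustration only.

# Hypothetical benchmark config entry; only index_prefix, model_path, and
# system_prompt_mode appear in this diff -- everything else is assumed.
config = {
    "index_prefix": "textbook",
    "model_path": "models/local-llm.gguf",
    "system_prompt_mode": "default",  # config.get(...) yields None when this key is omitted
}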