Fix CI tests (#130)

luarss · web-flow · commit 95319573cccc · 2025-06-04T10:13:18.000+08:00
- Replace the custom DeepEval model wrappers with DeepEval's first-class `GeminiModel` support. - Note: we cannot use `api_key` because of a limitation of free-tier accounts. - Bump versions. - Fixes #131 - Fixes #129 --------- Signed-off-by: Jack Luar <jluar@precisioninno.com>
diff --git a/.github/workflows/ci-secret.yaml b/.github/workflows/ci-secret.yaml
@@ -35,10 +35,16 @@ jobs:
     - name: Populate environment variables
       run: |
         cp backend/.env.example backend/.env
+        cp backend/.env evaluation/.env
+
         sed -i 's|{{GOOGLE_API_KEY}}|${{ secrets.GOOGLE_API_KEY }}|g' backend/.env
+        sed -i 's|{{GOOGLE_PROJECT_ID}}|${{ secrets.GOOGLE_PROJECT_ID }}|g' backend/.env
         sed -i 's|{{PATH_TO_GOOGLE_APPLICATION_CREDENTIALS}}|src/secret.json|g' backend/.env
-        cp backend/.env evaluation/.env
-        cp backend/.env frontend/.env
+
+        sed -i 's|{{GOOGLE_API_KEY}}|${{ secrets.GOOGLE_API_KEY }}|g' evaluation/.env
+        sed -i 's|{{GOOGLE_PROJECT_ID}}|${{ secrets.GOOGLE_PROJECT_ID }}|g' evaluation/.env
+        sed -i 's|{{PATH_TO_GOOGLE_APPLICATION_CREDENTIALS}}|auto_evaluation/src/secret.json|g' evaluation/.env
+
         cp ${{ secrets.PATH_TO_GOOGLE_APPLICATION_CREDENTIALS }} backend/src
         cp ${{ secrets.PATH_TO_GOOGLE_APPLICATION_CREDENTIALS }} evaluation/auto_evaluation/src
     - name: Build Docker image
diff --git a/Makefile b/Makefile
@@ -1,4 +1,5 @@
 FOLDERS=backend frontend evaluation
+GOOGLE_SECRET_JSON:=$(HOME)/secret.json
 
 .PHONY: init
 init:
@@ -28,6 +29,12 @@ docker-up:
 docker-down:
 	@docker compose down --remove-orphans
 
+# --- Development Commands ---
+.PHONY: seed-credentials
+seed-credentials:
+	@cp $(GOOGLE_SECRET_JSON) backend/src
+	@cp $(GOOGLE_SECRET_JSON) evaluation/auto_evaluation/src
+
 .PHONY: changelog
 changelog:
 	@git log --pretty=format:"%h%x09%an%x09%ad%x09%s" --date=short --since="2024-06-01" > CHANGELOG.md
diff --git a/backend/.dockerignore b/backend/.dockerignore
@@ -0,0 +1,7 @@
+.venv
+*.egg-info
+.mypy-cache
+__pycache__
+faiss_db
+data
+tests
diff --git a/backend/.env.example b/backend/.env.example
@@ -1,4 +1,5 @@
 GOOGLE_API_KEY={{GOOGLE_API_KEY}}
+GOOGLE_PROJECT_ID={{GOOGLE_PROJECT_ID}}
 GOOGLE_APPLICATION_CREDENTIALS={{PATH_TO_GOOGLE_APPLICATION_CREDENTIALS}}
 
 USE_CUDA=false
diff --git a/evaluation/auto_evaluation/eval_main.py b/evaluation/auto_evaluation/eval_main.py
@@ -11,8 +11,8 @@
 from dotenv import load_dotenv
 from deepeval.test_case import LLMTestCase
 from deepeval import evaluate
+from deepeval.models import GeminiModel
 
-from auto_evaluation.src.models.vertex_ai import GoogleVertexAILangChain
 from auto_evaluation.src.metrics.retrieval import (
     make_contextual_precision_metric,
     make_contextual_recall_metric,
@@ -42,7 +42,11 @@ def __init__(self, base_url: str, dataset: str, reranker_base_url: str = ""):
         self.dataset = dataset
         self.reranker_base_url = reranker_base_url
         self.qns = preprocess.read_data(self.dataset)
-        self.eval_model = GoogleVertexAILangChain(model_name="gemini-1.5-pro-002")
+        self.eval_model = GeminiModel(
+            model_name="gemini-1.5-pro-002",
+            project=os.getenv("GOOGLE_PROJECT_ID", ""),
+            location=os.getenv("GOOGLE_CLOUD_LOCATION", "us-central1"),
+        )
         self.log_dir = "logs"
         os.makedirs(self.log_dir, exist_ok=True)
         self.sanity_check()
@@ -91,8 +95,8 @@ def evaluate(self, retriever: str):
 
         # parallel evaluate
         evaluate(
-            retrieval_tcs,
-            [precision, recall, hallucination],
+            test_cases=retrieval_tcs,
+            metrics=[precision, recall, hallucination],
             print_results=False,
         )
 
diff --git a/evaluation/auto_evaluation/src/metrics/retrieval.py b/evaluation/auto_evaluation/src/metrics/retrieval.py
@@ -46,7 +46,7 @@ def make_faithfulness_metric(model: DeepEvalBaseLLM) -> FaithfulnessMetric:
     )
 
 
-def make_hallucination_metric(model: DeepEvalBaseLLM) -> FaithfulnessMetric:
+def make_hallucination_metric(model: DeepEvalBaseLLM) -> HallucinationMetric:
     return HallucinationMetric(
         threshold=HALLUCINATION_THRESHOLD,
         model=model,
diff --git a/evaluation/auto_evaluation/src/models/vertex_ai.py b/evaluation/auto_evaluation/src/models/vertex_ai.py
@@ -60,7 +60,7 @@ async def a_generate(self, prompt: str, schema: Any) -> Any:
             client=self.load_model(),
             mode=instructor.Mode.VERTEXAI_TOOLS,
         )
-        resp = await instructor_client.completions.create(
+        resp = await instructor_client.messages.create(
             messages=[
                 {
                     "role": "user",
diff --git a/evaluation/requirements.txt b/evaluation/requirements.txt
@@ -8,13 +8,12 @@ requests==2.32.3
 requests-oauthlib==2.0.0
 streamlit==1.37.0
 gspread==6.1.2
-deepeval==1.4.9
-langchain-google-vertexai==2.0.6
+deepeval==2.6.8
+langchain-google-vertexai==2.0.15
 asyncio==3.4.3
 huggingface-hub==0.26.2
 instructor[vertexai]==1.5.2
 openai==1.58.1
 pydantic==2.10.4
 tqdm==4.67.1
-vertexai==1.71.1
 plotly==5.24.1
diff --git a/frontend/.dockerignore b/frontend/.dockerignore
@@ -0,0 +1,4 @@
+.venv
+*.egg-info
+.mypy-cache
+__pycache__

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,5 @@`
`1`	`1`	`GOOGLE_API_KEY={{GOOGLE_API_KEY}}`
	`2`	`+GOOGLE_PROJECT_ID={{GOOGLE_PROJECT_ID}}`
`2`	`3`	`GOOGLE_APPLICATION_CREDENTIALS={{PATH_TO_GOOGLE_APPLICATION_CREDENTIALS}}`
`3`	`4`
`4`	`5`	`USE_CUDA=false`
Original file line number	Diff line number	Diff line change
`@@ -46,7 +46,7 @@ def make_faithfulness_metric(model: DeepEvalBaseLLM) -> FaithfulnessMetric:`
`46`	`46`	`)`
`47`	`47`
`48`	`48`
`49`		`-def make_hallucination_metric(model: DeepEvalBaseLLM) -> FaithfulnessMetric:`
	`49`	`+def make_hallucination_metric(model: DeepEvalBaseLLM) -> HallucinationMetric:`
`50`	`50`	`return HallucinationMetric(`
`51`	`51`	`threshold=HALLUCINATION_THRESHOLD,`
`52`	`52`	`model=model,`
Original file line number	Diff line number	Diff line change
`@@ -60,7 +60,7 @@ async def a_generate(self, prompt: str, schema: Any) -> Any:`
`60`	`60`	`client=self.load_model(),`
`61`	`61`	`mode=instructor.Mode.VERTEXAI_TOOLS,`
`62`	`62`	`)`
`63`		`- resp = await instructor_client.completions.create(`
	`63`	`+ resp = await instructor_client.messages.create(`
`64`	`64`	`messages=[`
`65`	`65`	`{`
`66`	`66`	`"role": "user",`