Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ jobs:
build-backend-docker:
runs-on: self-hosted
steps:
- name: Setup python
uses: actions/setup-python@v5
with:
python-version: '3.12'
- name: Checkout code
uses: actions/checkout@v4
- name: Setup prereqs
Expand All @@ -24,10 +28,23 @@ jobs:
cp backend/.env.example backend/.env
sed -i 's|{{GOOGLE_API_KEY}}|${{ secrets.GOOGLE_API_KEY }}|g' backend/.env
sed -i 's|{{PATH_TO_GOOGLE_APPLICATION_CREDENTIALS}}|src/secret.json|g' backend/.env
cp backend/.env evaluation/.env
cp backend/.env frontend/.env
cp ${{ secrets.PATH_TO_GOOGLE_APPLICATION_CREDENTIALS }} backend/src
cp ${{ secrets.PATH_TO_GOOGLE_APPLICATION_CREDENTIALS }} evaluation/auto_evaluation/src
- name: Build Docker image
run: |
make docker
sleep 900 # TODO: Remove this once the docker-compose healthcheck timeout issue is fixed.
- name: Run LLM CI
working-directory: evaluation
run: |
make llm-tests
- name: Create commit comment
uses: peter-evans/commit-comment@v3
with:
token: ${{ secrets.GH_PAT }}
body-path: evaluation/auto_evaluation/llm_tests_output.txt
- name: Teardown
if: always()
run: |
Expand Down
6 changes: 4 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ __pycache__/
backend/data/*
backend/src/*.json
*.pyc
*.egg-info/
frontend/*.json
evaluation/human_evaluation/*.json
/*.json
Expand All @@ -21,7 +22,8 @@ documents.txt
.venv

# evaluations
.deepeval_telemtry.txt
**/.deepeval_telemtry.txt
*.csv
*.deepeval-cache.json
**/.deepeval-cache.json
temp_test_run_data.json
**/llm_tests_output.txt
4 changes: 3 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
FOLDERS=backend frontend
.PHONY: init init-dev format check

FOLDERS=backend frontend evaluation

init:
@for folder in $(FOLDERS); do (cd $$folder && make init && cd ../); done
Expand Down
2 changes: 1 addition & 1 deletion backend/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,4 @@ RUN python /ORAssistant-backend/src/post_install.py

EXPOSE 8000

CMD ["uvicorn", "src.api.main:app", "--host", "0.0.0.0", "--port", "8000"]
CMD ["uvicorn", "src.api.main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"]
2 changes: 1 addition & 1 deletion backend/src/api/routers/graphs.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ async def get_agent_response(user_input: UserInput) -> ChatResponse:
tool_index = 1
for tool in tools:
urls.extend(list(output[tool_index].values())[0]["urls"])
context.extend(list(set(list(output[tool_index].values())[0]["context"])))
context.append(list(output[tool_index].values())[0]["context"])
tool_index += 1
else:
llm_response = "LLM response extraction failed"
Expand Down
9 changes: 5 additions & 4 deletions backend/src/tools/format_docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

def format_docs(docs: list[Document]) -> tuple[str, list[str], list[str]]:
doc_text = ""
doc_texts = ""
doc_texts = []
doc_urls = []
doc_srcs = []

Expand All @@ -19,10 +19,11 @@ def format_docs(docs: list[Document]) -> tuple[str, list[str], list[str]]:
doc_text = f"{gh_discussion_prompt_template}\n\n{doc.page_content}"
else:
doc_text = doc.page_content
doc_texts.append(doc_text)

if "url" in doc.metadata:
doc_urls.append(doc.metadata["url"])

doc_output = "\n\n -------------------------- \n\n".join(doc_texts)

doc_texts += f"\n\n- - - - - - - - - - - - - - - \n\n{doc_text}"

return doc_texts, doc_srcs, doc_urls
return doc_output, doc_srcs, doc_urls
14 changes: 13 additions & 1 deletion evaluation/Makefile
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
.PHONY: init init-dev format check clean

init:
@python3 -m venv .venv && \
. .venv/bin/activate && \
pip install -r requirements.txt
pip install -r requirements.txt && \
pip install -e .

init-dev: init
@. .venv/bin/activate && \
Expand All @@ -15,3 +18,12 @@ format:
check:
@. .venv/bin/activate && \
ruff check --fix

clean:
@rm -f llm_tests_output.txt
@rm -f **/.deepeval-cache.json

llm-tests: clean
@. .venv/bin/activate && \
cd auto_evaluation && \
./llm_tests.sh 2>&1 | tee llm_tests_output.txt
Empty file.
1 change: 0 additions & 1 deletion evaluation/auto_evaluation/content_metrics.json

This file was deleted.

7 changes: 6 additions & 1 deletion evaluation/auto_evaluation/dataset/hf_pull.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from huggingface_hub import snapshot_download
import os

if __name__ == "__main__":

def main():
cur_dir = os.path.dirname(os.path.abspath(__file__))
snapshot_download(
"The-OpenROAD-Project/ORAssistant_Public_Evals",
Expand All @@ -13,3 +14,7 @@
"README.md",
],
)


if __name__ == "__main__":
main()
60 changes: 60 additions & 0 deletions evaluation/auto_evaluation/dataset/preprocess.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import csv
import json
from typing import Any


def read_data(csv_file: str) -> list[dict]:
    """Read evaluation question/answer pairs from a two-column CSV file.

    Args:
        csv_file: Path to a CSV file whose header row is followed by
            (question, ground_truth) data rows.

    Returns:
        One dict per data row with keys "question" and "ground_truth";
        surrounding whitespace is stripped from both values.

    Raises:
        ValueError: If the file is empty or the header does not have
            exactly 2 columns.
    """
    questions = []
    # newline="" is the documented way to open files for the csv module;
    # without it, quoted fields containing newlines are mis-parsed.
    with open(csv_file, "r", newline="") as f:
        reader = csv.reader(f)
        # next(reader, None) instead of next(reader): an empty file should
        # raise a clear error, not leak StopIteration to the caller.
        header = next(reader, None)
        # Raise instead of assert: asserts are stripped under `python -O`.
        if header is None or len(header) != 2:
            raise ValueError("CSV file must have exactly 2 columns")
        for row in reader:
            questions.append(
                {"question": row[0].strip(), "ground_truth": row[1].strip()}
            )
    return questions


def write_data(results_list: list[dict[str, Any]], results_path: str) -> None:
    """Write a list of result dicts to a CSV file.

    The header row is taken from the keys of the first result; every
    result is expected to share the same keys.

    Args:
        results_list: Result rows to write. May be empty, in which case
            no file is created.
        results_path: Destination CSV path (overwritten if it exists).
    """
    # Guard the empty case: the original indexed results_list[0] and
    # raised IndexError when there was nothing to write.
    if not results_list:
        print(f"No results to write to {results_path}")
        return
    keys = list(results_list[0].keys())
    # newline="" per the csv module docs; otherwise the writer emits
    # blank lines between rows on Windows.
    with open(results_path, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(keys)
        for result in results_list:
            writer.writerow([result[key] for key in keys])
    print(f"Results written to {results_path}")


def read_deepeval_cache(
    cache_path: str = ".deepeval-cache.json",
) -> tuple[dict[str, list[float]], dict[str, list[bool]]]:
    """Summarize per-metric scores and pass rates from a deepeval cache.

    Reads the deepeval result cache (JSON), groups the cached metric
    scores and success flags by metric name, and prints the average
    score and pass rate for each metric.

    Args:
        cache_path: Path to the deepeval cache file. Defaults to
            ".deepeval-cache.json" in the current directory, matching
            the previous hard-coded behavior.

    Returns:
        A (metric_scores, metric_passes) tuple mapping metric name to
        the list of scores / success flags collected from the cache.
    """
    metric_scores: dict[str, list[float]] = {
        "Contextual Precision": [],
        "Contextual Recall": [],
        "Hallucination": [],
    }
    metric_passes: dict[str, list[bool]] = {
        "Contextual Precision": [],
        "Contextual Recall": [],
        "Hallucination": [],
    }
    with open(cache_path) as f:
        results = json.load(f)
    # .values(): the lookup-map keys are never used.
    for value in results["test_cases_lookup_map"].values():
        for metric in value["cached_metrics_data"]:
            data = metric["metric_data"]
            # setdefault tolerates metric names beyond the three above
            # instead of raising KeyError.
            metric_scores.setdefault(data["name"], []).append(data["score"])
            metric_passes.setdefault(data["name"], []).append(data["success"])

    print("Average Metric Scores: ")
    for key, scores in metric_scores.items():
        # Guard empty lists: a metric absent from the cache previously
        # caused ZeroDivisionError.
        print(key, sum(scores) / len(scores) if scores else 0.0)
    print("Metric Passrates: ")
    for key, passes in metric_passes.items():
        print(key, passes.count(True) / len(passes) if passes else 0.0)

    return metric_scores, metric_passes


if __name__ == "__main__":
    read_deepeval_cache()
64 changes: 0 additions & 64 deletions evaluation/auto_evaluation/demo.py

This file was deleted.

Loading