Skip to content

Commit 260bba5

Browse files
authored
DeepEval CI integration (#86)
* initial scaffold for deepeval integration + remove unnecessary files
* add llm_tests target and CI
* fix CI syntax, seed env for evaluation
* use python3.12 for default workflow
* fix CI -> GH_PAT and list_contexts issue
* add instructor for enforcing json llm outputs in deepeval
* silent deepeval outputs
* add json deepeval-cache parser
* set two deepeval metrics as notimplemented -protobuf error
* populate env variables in 1 step
* Copy secret.json to correct path
* add sleep to make sure docker comes up and is alive
* evaluate all qns
* cleanup and use average stats

---------

Signed-off-by: Jack Luar <[email protected]>
1 parent b850af4 commit 260bba5

File tree

19 files changed

+326
-97
lines changed

19 files changed

+326
-97
lines changed

.github/workflows/ci.yaml

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,10 @@ jobs:
1111
build-backend-docker:
1212
runs-on: self-hosted
1313
steps:
14+
- name: Setup python
15+
uses: actions/setup-python@v5
16+
with:
17+
python-version: '3.12'
1418
- name: Checkout code
1519
uses: actions/checkout@v4
1620
- name: Setup prereqs
@@ -24,10 +28,23 @@ jobs:
2428
cp backend/.env.example backend/.env
2529
sed -i 's|{{GOOGLE_API_KEY}}|${{ secrets.GOOGLE_API_KEY }}|g' backend/.env
2630
sed -i 's|{{PATH_TO_GOOGLE_APPLICATION_CREDENTIALS}}|src/secret.json|g' backend/.env
31+
cp backend/.env evaluation/.env
32+
cp backend/.env frontend/.env
2733
cp ${{ secrets.PATH_TO_GOOGLE_APPLICATION_CREDENTIALS }} backend/src
34+
cp ${{ secrets.PATH_TO_GOOGLE_APPLICATION_CREDENTIALS }} evaluation/auto_evaluation/src
2835
- name: Build Docker image
2936
run: |
3037
make docker
38+
sleep 900 # TODO: Remove this sleep once the docker-compose healthcheck timeout is restored/fixed.
39+
- name: Run LLM CI
40+
working-directory: evaluation
41+
run: |
42+
make llm-tests
43+
- name: Create commit comment
44+
uses: peter-evans/commit-comment@v3
45+
with:
46+
token: ${{ secrets.GH_PAT }}
47+
body-path: evaluation/auto_evaluation/llm_tests_output.txt
3148
- name: Teardown
3249
if: always()
3350
run: |

.gitignore

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ __pycache__/
44
backend/data/*
55
backend/src/*.json
66
*.pyc
7+
*.egg-info/
78
frontend/*.json
89
evaluation/human_evaluation/*.json
910
/*.json
@@ -21,7 +22,8 @@ documents.txt
2122
.venv
2223

2324
# evaluations
24-
.deepeval_telemtry.txt
25+
**/.deepeval_telemtry.txt
2526
*.csv
26-
*.deepeval-cache.json
27+
**/.deepeval-cache.json
2728
temp_test_run_data.json
29+
**/llm_tests_output.txt

Makefile

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1-
FOLDERS=backend frontend
1+
.PHONY: init init-dev format check
2+
3+
FOLDERS=backend frontend evaluation
24

35
init:
46
@for folder in $(FOLDERS); do (cd $$folder && make init && cd ../); done

backend/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,4 +28,4 @@ RUN python /ORAssistant-backend/src/post_install.py
2828

2929
EXPOSE 8000
3030

31-
CMD ["uvicorn", "src.api.main:app", "--host", "0.0.0.0", "--port", "8000"]
31+
CMD ["uvicorn", "src.api.main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"]

backend/src/api/routers/graphs.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ async def get_agent_response(user_input: UserInput) -> ChatResponse:
121121
tool_index = 1
122122
for tool in tools:
123123
urls.extend(list(output[tool_index].values())[0]["urls"])
124-
context.extend(list(set(list(output[tool_index].values())[0]["context"])))
124+
context.append(list(output[tool_index].values())[0]["context"])
125125
tool_index += 1
126126
else:
127127
llm_response = "LLM response extraction failed"

backend/src/tools/format_docs.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
def format_docs(docs: list[Document]) -> tuple[str, list[str], list[str]]:
77
doc_text = ""
8-
doc_texts = ""
8+
doc_texts = []
99
doc_urls = []
1010
doc_srcs = []
1111

@@ -19,10 +19,11 @@ def format_docs(docs: list[Document]) -> tuple[str, list[str], list[str]]:
1919
doc_text = f"{gh_discussion_prompt_template}\n\n{doc.page_content}"
2020
else:
2121
doc_text = doc.page_content
22+
doc_texts.append(doc_text)
2223

2324
if "url" in doc.metadata:
2425
doc_urls.append(doc.metadata["url"])
26+
27+
doc_output = "\n\n -------------------------- \n\n".join(doc_texts)
2528

26-
doc_texts += f"\n\n- - - - - - - - - - - - - - - \n\n{doc_text}"
27-
28-
return doc_texts, doc_srcs, doc_urls
29+
return doc_output, doc_srcs, doc_urls

evaluation/Makefile

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
1+
.PHONY: init init-dev format check clean
2+
13
init:
24
@python3 -m venv .venv && \
35
. .venv/bin/activate && \
4-
pip install -r requirements.txt
6+
pip install -r requirements.txt && \
7+
pip install -e .
58

69
init-dev: init
710
@. .venv/bin/activate && \
@@ -15,3 +18,12 @@ format:
1518
check:
1619
@. .venv/bin/activate && \
1720
ruff check --fix
21+
22+
clean:
23+
@rm -f llm_tests_output.txt
24+
@rm -f **/.deepeval-cache.json
25+
26+
llm-tests: clean
27+
@. .venv/bin/activate && \
28+
cd auto_evaluation && \
29+
./llm_tests.sh 2>&1 | tee llm_tests_output.txt

evaluation/auto_evaluation/__init__.py

Whitespace-only changes.

evaluation/auto_evaluation/content_metrics.json

Lines changed: 0 additions & 1 deletion
This file was deleted.

evaluation/auto_evaluation/dataset/hf_pull.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
from huggingface_hub import snapshot_download
22
import os
33

4-
if __name__ == "__main__":
4+
5+
def main():
56
cur_dir = os.path.dirname(os.path.abspath(__file__))
67
snapshot_download(
78
"The-OpenROAD-Project/ORAssistant_Public_Evals",
@@ -13,3 +14,7 @@
1314
"README.md",
1415
],
1516
)
17+
18+
19+
if __name__ == "__main__":
20+
main()

0 commit comments

Comments
 (0)