Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ jobs:
build-backend-docker:
runs-on: self-hosted
steps:
- name: Setup python
uses: actions/setup-python@v5
with:
python-version: '3.12'
- name: Checkout code
uses: actions/checkout@v4
- name: Setup prereqs
Expand All @@ -24,10 +28,23 @@ jobs:
cp backend/.env.example backend/.env
sed -i 's|{{GOOGLE_API_KEY}}|${{ secrets.GOOGLE_API_KEY }}|g' backend/.env
sed -i 's|{{PATH_TO_GOOGLE_APPLICATION_CREDENTIALS}}|src/secret.json|g' backend/.env
cp backend/.env evaluation/.env
cp backend/.env frontend/.env
cp ${{ secrets.PATH_TO_GOOGLE_APPLICATION_CREDENTIALS }} backend/src
cp ${{ secrets.PATH_TO_GOOGLE_APPLICATION_CREDENTIALS }} evaluation/auto_evaluation/src
- name: Build Docker image
run: |
make docker
sleep 900 # TODO: Remove this once the docker-compose healthcheck timeout issue is fixed.
- name: Run LLM CI
working-directory: evaluation
run: |
make llm-tests
- name: Create commit comment
uses: peter-evans/commit-comment@v3
with:
token: ${{ secrets.GH_PAT }}
body-path: evaluation/auto_evaluation/llm_tests_output.txt
- name: Teardown
if: always()
run: |
Expand Down
6 changes: 4 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ __pycache__/
backend/data/*
backend/src/*.json
*.pyc
*.egg-info/
frontend/*.json
evaluation/human_evaluation/*.json
/*.json
Expand All @@ -21,7 +22,8 @@ documents.txt
.venv

# evaluations
.deepeval_telemtry.txt
**/.deepeval_telemtry.txt
*.csv
*.deepeval-cache.json
**/.deepeval-cache.json
temp_test_run_data.json
**/llm_tests_output.txt
4 changes: 3 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
FOLDERS=backend frontend
.PHONY: init init-dev format check

FOLDERS=backend frontend evaluation

init:
@for folder in $(FOLDERS); do (cd $$folder && make init && cd ../); done
Expand Down
2 changes: 1 addition & 1 deletion backend/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,4 @@ RUN python /ORAssistant-backend/src/post_install.py

EXPOSE 8000

CMD ["uvicorn", "src.api.main:app", "--host", "0.0.0.0", "--port", "8000"]
CMD ["uvicorn", "src.api.main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"]
2 changes: 1 addition & 1 deletion backend/src/api/routers/graphs.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ async def get_agent_response(user_input: UserInput) -> ChatResponse:
tool_index = 1
for tool in tools:
urls.extend(list(output[tool_index].values())[0]["urls"])
context.extend(list(set(list(output[tool_index].values())[0]["context"])))
context.append(list(output[tool_index].values())[0]["context"])
tool_index += 1
else:
llm_response = "LLM response extraction failed"
Expand Down
9 changes: 5 additions & 4 deletions backend/src/tools/format_docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

def format_docs(docs: list[Document]) -> tuple[str, list[str], list[str]]:
doc_text = ""
doc_texts = ""
doc_texts = []
doc_urls = []
doc_srcs = []

Expand All @@ -19,10 +19,11 @@ def format_docs(docs: list[Document]) -> tuple[str, list[str], list[str]]:
doc_text = f"{gh_discussion_prompt_template}\n\n{doc.page_content}"
else:
doc_text = doc.page_content
doc_texts.append(doc_text)

if "url" in doc.metadata:
doc_urls.append(doc.metadata["url"])

doc_output = "\n\n -------------------------- \n\n".join(doc_texts)

doc_texts += f"\n\n- - - - - - - - - - - - - - - \n\n{doc_text}"

return doc_texts, doc_srcs, doc_urls
return doc_output, doc_srcs, doc_urls
14 changes: 13 additions & 1 deletion evaluation/Makefile
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
.PHONY: init init-dev format check clean

init:
@python3 -m venv .venv && \
. .venv/bin/activate && \
pip install -r requirements.txt
pip install -r requirements.txt && \
pip install -e .

init-dev: init
@. .venv/bin/activate && \
Expand All @@ -15,3 +18,12 @@ format:
check:
@. .venv/bin/activate && \
ruff check --fix

clean:
@rm -f llm_tests_output.txt
@rm -f **/.deepeval-cache.json

llm-tests: clean
@. .venv/bin/activate && \
cd auto_evaluation && \
./llm_tests.sh 2>&1 | tee llm_tests_output.txt
Empty file.
1 change: 0 additions & 1 deletion evaluation/auto_evaluation/content_metrics.json

This file was deleted.

7 changes: 6 additions & 1 deletion evaluation/auto_evaluation/dataset/hf_pull.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from huggingface_hub import snapshot_download
import os

if __name__ == "__main__":

def main():
cur_dir = os.path.dirname(os.path.abspath(__file__))
snapshot_download(
"The-OpenROAD-Project/ORAssistant_Public_Evals",
Expand All @@ -13,3 +14,7 @@
"README.md",
],
)


if __name__ == "__main__":
main()
60 changes: 60 additions & 0 deletions evaluation/auto_evaluation/dataset/preprocess.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import csv
import json
from typing import Any


def read_data(csv_file: str) -> list[dict]:
    """Read evaluation question/answer pairs from a two-column CSV file.

    Args:
        csv_file: Path to a CSV file whose header row is followed by
            (question, ground_truth) data rows.

    Returns:
        One dict per data row with keys "question" and "ground_truth";
        surrounding whitespace is stripped from both values.

    Raises:
        ValueError: If the file is empty or the header does not have
            exactly 2 columns.
    """
    questions = []
    # newline="" is the documented way to open files for the csv module;
    # without it, quoted fields containing newlines are mis-parsed.
    with open(csv_file, "r", newline="") as f:
        reader = csv.reader(f)
        # next(reader, None) instead of next(reader): an empty file should
        # raise a clear error, not leak StopIteration to the caller.
        header = next(reader, None)
        # Raise instead of assert: asserts are stripped under `python -O`.
        if header is None or len(header) != 2:
            raise ValueError("CSV file must have exactly 2 columns")
        for row in reader:
            questions.append(
                {"question": row[0].strip(), "ground_truth": row[1].strip()}
            )
    return questions


def write_data(results_list: list[dict[str, Any]], results_path: str) -> None:
    """Write a list of result dicts to a CSV file.

    The header row is taken from the keys of the first result; every
    result is expected to share the same keys.

    Args:
        results_list: Result rows to write. May be empty, in which case
            no file is created.
        results_path: Destination CSV path (overwritten if it exists).
    """
    # Guard the empty case: the original indexed results_list[0] and
    # raised IndexError when there was nothing to write.
    if not results_list:
        print(f"No results to write to {results_path}")
        return
    keys = list(results_list[0].keys())
    # newline="" per the csv module docs; otherwise the writer emits
    # blank lines between rows on Windows.
    with open(results_path, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(keys)
        for result in results_list:
            writer.writerow([result[key] for key in keys])
    print(f"Results written to {results_path}")


def read_deepeval_cache(
    cache_path: str = ".deepeval-cache.json",
) -> tuple[dict[str, list[float]], dict[str, list[bool]]]:
    """Summarize per-metric scores and pass rates from a deepeval cache.

    Reads the deepeval result cache (JSON), groups the cached metric
    scores and success flags by metric name, and prints the average
    score and pass rate for each metric.

    Args:
        cache_path: Path to the deepeval cache file. Defaults to
            ".deepeval-cache.json" in the current directory, matching
            the previous hard-coded behavior.

    Returns:
        A (metric_scores, metric_passes) tuple mapping metric name to
        the list of scores / success flags collected from the cache.
    """
    metric_scores: dict[str, list[float]] = {
        "Contextual Precision": [],
        "Contextual Recall": [],
        "Hallucination": [],
    }
    metric_passes: dict[str, list[bool]] = {
        "Contextual Precision": [],
        "Contextual Recall": [],
        "Hallucination": [],
    }
    with open(cache_path) as f:
        results = json.load(f)
    # .values(): the lookup-map keys are never used.
    for value in results["test_cases_lookup_map"].values():
        for metric in value["cached_metrics_data"]:
            data = metric["metric_data"]
            # setdefault tolerates metric names beyond the three above
            # instead of raising KeyError.
            metric_scores.setdefault(data["name"], []).append(data["score"])
            metric_passes.setdefault(data["name"], []).append(data["success"])

    print("Average Metric Scores: ")
    for key, scores in metric_scores.items():
        # Guard empty lists: a metric absent from the cache previously
        # caused ZeroDivisionError.
        print(key, sum(scores) / len(scores) if scores else 0.0)
    print("Metric Passrates: ")
    for key, passes in metric_passes.items():
        print(key, passes.count(True) / len(passes) if passes else 0.0)

    return metric_scores, metric_passes


if __name__ == "__main__":
    read_deepeval_cache()
64 changes: 0 additions & 64 deletions evaluation/auto_evaluation/demo.py

This file was deleted.

Loading