
Commit 265d1c6

Merge pull request #47 from ansible/aap_38439
PR to apply the E2E OLS evaluation framework for the AAP chatbot
2 parents 8d092ac + e346349

File tree: 6 files changed (+65, −1 lines)

scripts/evaluation/README.md

Lines changed: 6 additions & 0 deletions
@@ -11,6 +11,7 @@ Currently we have 2 types of evaluations.
 - QnAs were generated from OCP docs by LLMs. It is possible that some of the questions/answers are not entirely correct. We are constantly trying to verify both Questions & Answers manually. If you find any QnA pair to be modified or removed, please create a PR.
 - OLS API should be ready/live with all the required provider+model configured.
 - It is possible that we want to run both consistency and model evaluation together. To avoid multiple API calls for same query, *model* evaluation first checks .csv file generated by *consistency* evaluation. If response is not present in csv file, then only we call API to get the response.
+- User needs to install python `matplotlib`, and `rouge_score` before running the evaluation.

 ### e2e test case

@@ -21,6 +22,11 @@ These evaluations are also part of **e2e test cases**. Currently *consistency* e
 python -m scripts.evaluation.driver
 ```

+### Sample run command
+```
+OPENAI_API_KEY=IGNORED python -m scripts.evaluation.driver --qna_pool_file ./scripts/evaluation/eval_data/aap-sample.parquet --eval_provider_model_id my_rhoai+granite3-8b --eval_metrics answer_relevancy answer_similarity_llm cos_score rougeL_precision --eval_modes vanilla --judge_model granite3-8b --judge_provider my_rhoai3 --eval_query_ids qna1
+```
+
 ### Input Data/QnA pool
 [Json file](eval_data/question_answer_pair.json)
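The README note above about reusing the consistency-evaluation CSV describes a simple read-through cache: model evaluation only calls the OLS API when a query has no stored response. A minimal sketch of that lookup, assuming an illustrative CSV layout with `query` and `response` columns (the column names and function signature are not taken from the repo):

```python
import pandas as pd

def get_response(query: str, csv_path: str, call_api) -> str:
    """Return a cached response for a query, calling the OLS API only on a miss."""
    try:
        cached = pd.read_csv(csv_path)
        hit = cached.loc[cached["query"] == query, "response"]
        if not hit.empty:
            return hit.iloc[0]  # reuse the answer consistency evaluation already stored
    except FileNotFoundError:
        pass  # consistency evaluation has not produced a CSV yet
    return call_api(query)  # cache miss: make the single API call
```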

Binary file (8.46 KB) not shown.
Binary file (38 KB) not shown.

scripts/evaluation/olsconfig.yaml

Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
+# olsconfig.yaml sample for local ollama server
+#
+# 1. install local ollama server from https://ollama.com/
+# 2. install llama3.1:latest model with:
+#    ollama pull llama3.1:latest
+# 3. Copy this file to the project root of cloned lightspeed-service repo
+# 4. Install dependencies with:
+#    make install-deps
+# 5. Start lightspeed-service with:
+#    OPENAI_API_KEY=IGNORED make run
+# 6. Open https://localhost:8080/ui in your web browser
+#
+llm_providers:
+  - name: ollama
+    type: openai
+    url: "http://localhost:11434/v1/"
+    models:
+      - name: "mistral"
+      - name: 'llama3.2:latest'
+  - name: my_rhoai
+    type: openai
+    url: "https://granite3-8b-wisdom-model-staging.apps.stage2-west.v2dz.p1.openshiftapps.com/v1"
+    credentials_path: ols_api_key.txt
+    models:
+      - name: granite3-8b
+ols_config:
+  # max_workers: 1
+  reference_content:
+    # product_docs_index_path: "./vector_db/vector_db/aap_product_docs/2.5"
+    # product_docs_index_id: aap-product-docs-2_5
+    # embeddings_model_path: "./vector_db/embeddings_model"
+  conversation_cache:
+    type: memory
+    memory:
+      max_entries: 1000
+  logging_config:
+    app_log_level: info
+    lib_log_level: warning
+    uvicorn_log_level: info
+  default_provider: ollama
+  default_model: 'llama3.2:latest'
+  query_validation_method: llm
+  user_data_collection:
+    feedback_disabled: false
+    feedback_storage: "/tmp/data/feedback"
+    transcripts_disabled: false
+    transcripts_storage: "/tmp/data/transcripts"
+dev_config:
+  # config options specific to dev environment - launching OLS in local
+  enable_dev_ui: true
+  disable_auth: true
+  disable_tls: true
+  pyroscope_url: "https://pyroscope.pyroscope.svc.cluster.local:4040"
+  # llm_params:
+  #   temperature_override: 0
+  # k8s_auth_token: optional_token_when_no_available_kube_config
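With this config in place (steps 1–5 in the comments), the service listens on port 8080 with TLS and auth disabled. A minimal smoke test in Python, assuming the `/v1/query` endpoint and JSON body shape used by lightspeed-service — verify the path against your checkout before relying on it:

```python
import requests

# Assumes lightspeed-service is running locally via `make run` and exposes
# POST /v1/query; dev_config above disables TLS and auth, so plain HTTP works.
resp = requests.post(
    "http://localhost:8080/v1/query",
    json={"query": "How do I create a job template in AAP?"},
    timeout=120,
)
resp.raise_for_status()
print(resp.json())
```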

scripts/evaluation/utils/constants.py

Lines changed: 2 additions & 0 deletions
@@ -11,6 +11,8 @@
     "azure_openai+gpt-4o": ("azure_openai", "gpt-4o"),
     "ollama+llama3.1:latest": ("ollama", "llama3.1:latest"),
     "ollama+mistral": ("ollama", "mistral"),
+    "my_rhoai+granite3-8b": ("my_rhoai", "granite3-8b"),
+    "my_rhoai3+granite3-1-8b": ("my_rhoai3", "granite3-1-8b"),
 }

 NON_LLM_EVALS = {
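The two new entries extend the table that maps a combined `provider+model` ID — the value passed as `--eval_provider_model_id` in the README's sample command — to its `(provider, model)` tuple. A minimal sketch of how such a table is typically consumed; the dict and function names here are illustrative, not the repo's actual identifiers:

```python
# Illustrative stand-in for the mapping extended in constants.py.
PROVIDER_MODEL_MAP: dict[str, tuple[str, str]] = {
    "ollama+mistral": ("ollama", "mistral"),
    "my_rhoai+granite3-8b": ("my_rhoai", "granite3-8b"),
    "my_rhoai3+granite3-1-8b": ("my_rhoai3", "granite3-1-8b"),
}

def resolve(eval_provider_model_id: str) -> tuple[str, str]:
    """Split an ID like 'my_rhoai+granite3-8b' into (provider, model)."""
    try:
        return PROVIDER_MODEL_MAP[eval_provider_model_id]
    except KeyError:
        raise ValueError(f"unknown provider+model id: {eval_provider_model_id}") from None

assert resolve("my_rhoai+granite3-8b") == ("my_rhoai", "granite3-8b")
```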

scripts/evaluation/utils/relevancy_score.py

Lines changed: 1 addition & 1 deletion
@@ -42,7 +42,7 @@ def get_score(
             # raise
             sleep(time_to_breath)

-        if out:
+        if out and isinstance(out, dict):
             valid_flag = out["Valid"]
             gen_questions = out["Question"]
             score = 0
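The guard added here matters because the judge model's output is parsed from free-form text: if parsing yields anything other than a dict (a bare list, string, or `None`), the `out["Valid"]` lookup would raise. A small reproduction of the failure mode the check prevents, using `json.loads` and example payloads for illustration — the repo's actual parsing step may differ:

```python
import json

# The judge is expected to return an object like {"Valid": 1, "Question": [...]},
# but a model can emit any JSON value that still parses cleanly.
for raw in ('{"Valid": 1, "Question": ["q1"]}', '["q1", "q2"]', 'null'):
    out = json.loads(raw)
    if out and isinstance(out, dict):  # the check added in this commit
        print("usable:", out["Valid"], out["Question"])
    else:
        print("skipping non-dict judge output:", out)
```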
