
Commit d13a41f

Merge pull request #6 from jpodivin/automation
Automation for evaluating Log Detective
2 parents 2d5b4f7 + e344d23 commit d13a41f

4 files changed, +268 -1 lines changed

README.md

Lines changed: 42 additions & 0 deletions
@@ -43,3 +43,45 @@ notes: |
Was I wrong?
api: /analysis/staged
```

## Automated evaluation

Evaluation of Log Detective performance can be performed automatically using
the `validation.py` script. Dependencies for the tool are defined in the
`requirements.txt` file and should be installed in a virtual environment.

Before running the script, the API key for the LLM judge must be set
in an environment variable `OPENAI_API_KEY`.
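
A minimal setup sketch (the virtual environment path and placeholder key below
are illustrative, not part of this repository):

```
# create and activate a virtual environment (any path works)
python3 -m venv .venv
source .venv/bin/activate
# install the tool's dependencies
pip install -r requirements.txt
# API key for the LLM judge, read by validation.py
export OPENAI_API_KEY="<YOUR_API_KEY>"
```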

Example:

```
./validation.py <DATA_PATH> <LOG_DETECTIVE_URL> <LLM_URL> <LLM_NAME>
```

The script sends each of the stored log files to Log Detective for evaluation,
then submits both the final analysis from Log Detective and the actual issue
in the log to the LLM to determine the similarity of the two.

Scores are assigned on a scale from `1` to `10`, where `10` stands for an
absolute match and `1` for no match at all.

Example:

```
[Expected Response]
Build failed due to missing patch file `gnome-shell-notify-gnome-session.patch`.
Fixing the issue requires making sure that all specified patch files are present in the `SOURCES` directory.


[Actual Response]
The RPM build failed because the patch file `gnome-shell-notify-gnome-session.patch` was missing from the `SOURCES` directory during the `buildsrpm` phase. This caused the `rpmbuild -bs` command to fail.

To resolve this, ensure that the `gnome-shell-notify-gnome-session.patch` file is present in the `SOURCES` directory and is correctly referenced in the RPM spec file.


Similarity Score: 8/10
--------------------------------------------------------------------------------
```

Scores greater than or equal to 6 are considered sufficient for passing.

data/89460881-ab96-4597-905c-d0f8f48b9ef8/sample_metadata.yaml

Lines changed: 1 addition & 1 deletion
@@ -9,4 +9,4 @@ log_detective_analysis: |
This indicates the file is missing at the specified URL. The issue could be a temporary outage, incorrect URL, or the file was intentionally removed.
Verify the URL in the spec file, check for typos, and ensure the file exists at the specified location.
If the URL is dynamically generated, ensure that the variables are correctly defined and resolved. Consider adding a retry mechanism or using a mirror if the issue persists.
-log_file: root.log
+log_file: builder-live.log

requirements.txt

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
pydantic==2.11.7
requests==2.32.4
openai==1.99.9
pyyaml==6.0.2

validation.py

Lines changed: 221 additions & 0 deletions
@@ -0,0 +1,221 @@
#!/usr/bin/env python3

import os
import sys
import requests
import yaml
import openai
import argparse
from pydantic import BaseModel, Field, ValidationError

# --- Configuration ---
# Set your OpenAI API key as an environment variable named OPENAI_API_KEY
# You can get a key from https://beta.openai.com/account/api-keys
API_KEY = os.getenv("OPENAI_API_KEY")
LOG_REPO_BASE_URL = (
    "https://raw.githubusercontent.com/fedora-copr/logdetective-sample/main/data/"
)


class SimilarityScore(BaseModel):
    """Defines the structure for the similarity score response from the LLM."""

    score: int = Field(
        ..., ge=1, le=10, description="The similarity score from 1 to 10."
    )


def get_similarity_score(
    expected_text: str, actual_text: str, llm_client: openai.OpenAI, llm_model: str
) -> int:
    """
    Uses a Large Language Model to score the similarity between two texts.

    Args:
        expected_text (str): The expected response text.
        actual_text (str): The actual response text from the API.
        llm_client (openai.OpenAI): Client configured for the judge LLM endpoint.
        llm_model (str): The LLM model to use for the evaluation.

    Returns:
        int: A similarity score from 1 to 10.

    Raises:
        openai.APIError, TypeError, or pydantic.ValidationError if the judge
        call fails or its response cannot be parsed.
    """

    prompt = f"""
You are an AI performance evaluator. Your task is to compare two text snippets and rate their similarity on a scale of 1 to 10, where 1 is completely dissimilar and 10 is identical or semantically equivalent.
Provide only the integer score in your response.

Expected Response:
---
{expected_text}
---

Actual Response:
---
{actual_text}
---

Similarity Score (1-10):
"""
    try:
        response = llm_client.chat.completions.create(
            model=llm_model,
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant that provides similarity scores.",
                },
                {"role": "user", "content": prompt},
            ],
            # Constrain the judge to return JSON matching the SimilarityScore schema
            response_format={
                "type": "json_schema",
                "json_schema": {
                    "name": "rated-snippet-analysis",
                    "schema": SimilarityScore.model_json_schema(),
                },
            },
        )
    except openai.APIError as e:
        print(f"Error calling OpenAI API: {e}", file=sys.stderr)
        raise e
    content = response.choices[0].message.content
    if not isinstance(content, str):
        print(f"Invalid response from LLM {content}")
        raise TypeError
    try:
        score = SimilarityScore.model_validate_json(content)
    except ValidationError as e:
        print(
            f"Error: Could not parse the score from the LLM response: '{content}'",
            file=sys.stderr,
        )
        raise e

    return score.score


def evaluate_samples(
    directory: str, server_address: str, llm_url: str, llm_model: str, llm_token: str
) -> None:
    """
    Traverses a directory to find and evaluate log analysis samples.

    Args:
        directory (str): The path to the directory containing the samples.
        server_address (str): The base address of the server.
        llm_url (str): Base URL of the LLM API used as judge.
        llm_model (str): Name of the judge model.
        llm_token (str): API key for the judge endpoint.
    """
    api_endpoint = "/analyze/staged"

    full_api_url = f"{server_address}{api_endpoint}"

    client = openai.OpenAI(base_url=llm_url, api_key=llm_token)
    # Each sample lives in its own directory containing a sample_metadata.yaml
    for root, _, files in os.walk(directory):
        for file in files:
            if file == "sample_metadata.yaml":
                yaml_path = os.path.join(root, file)
                print(f"--- Processing: {yaml_path} ---")

                try:
                    with open(yaml_path, "r") as f:
                        metadata = yaml.safe_load(f)
                except yaml.YAMLError as e:
                    print(f"Error parsing YAML file {yaml_path}: {e}", file=sys.stderr)
                    continue

                expected_issue = metadata.get("issue")
                log_file_name = metadata.get("log_file")
                sample_uuid = os.path.basename(root)

                if not expected_issue or not log_file_name:
                    print(
                        f"Skipping {yaml_path}: missing 'issue' or 'log_file' field.",
                        file=sys.stderr,
                    )
                    continue

                log_file_url = f"{LOG_REPO_BASE_URL}{sample_uuid}/{log_file_name}"
                payload = {"url": log_file_url}
                actual_response_data = None
                try:
                    print(
                        f"Calling Log Detective API: {full_api_url} with log file URL: {log_file_url}"
                    )
                    api_response = requests.post(full_api_url, json=payload, timeout=60)
                    api_response.raise_for_status()
                    actual_response_data = api_response.json()
                    # Extract the text from the 'explanation' object based on the provided schema
                    actual_issue = actual_response_data["explanation"]["text"]
                except requests.exceptions.RequestException as e:
                    print(f"Error calling API for {log_file_url}: {e}", file=sys.stderr)
                    continue
                except ValueError:
                    print(
                        f"Error: Could not decode JSON from API response for {log_file_url}",
                        file=sys.stderr,
                    )
                    continue
                except (KeyError, TypeError):
                    print(
                        f"Error: Could not find 'explanation.text' in API response for {log_file_url}. Response: {actual_response_data}",
                        file=sys.stderr,
                    )
                    continue

                print("\n[Expected Response]")
                print(expected_issue)
                print("\n[Actual Response]")
                print(actual_issue)

                try:
                    score = get_similarity_score(
                        expected_issue, actual_issue, client, llm_model
                    )
                except (openai.APIError, ValidationError, TypeError) as e:
                    print(
                        f"Failed to retrieve similarity score with {e}", file=sys.stderr
                    )
                    continue

                print(f"\nSimilarity Score: {score}/10")

                print("-" * (len(yaml_path) + 18))
                print("\n")


def main():
    """
    Main function to parse arguments and run the evaluation script.
    """
    parser = argparse.ArgumentParser(
        description="Evaluate AI system performance by comparing expected and actual responses.",
    )
    parser.add_argument(
        "data_directory", help="Path to the directory containing the sample data."
    )
    parser.add_argument(
        "logdetective_url",
        help="Base URL of the Log Detective server (e.g., http://localhost:8080).",
    )
    parser.add_argument("llm_url", help="URL of LLM API to use as judge")
    parser.add_argument("llm_model", help="Name of LLM model to use as judge")
    args = parser.parse_args()

    if not API_KEY:
        print("Error: OPENAI_API_KEY environment variable not set.", file=sys.stderr)
        sys.exit(1)

    if not os.path.isdir(args.data_directory):
        print(f"Error: Directory not found at '{args.data_directory}'", file=sys.stderr)
        sys.exit(1)

    evaluate_samples(
        args.data_directory,
        args.logdetective_url,
        args.llm_url,
        args.llm_model,
        API_KEY,
    )


if __name__ == "__main__":
    main()
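
As a quick illustration of the structured-output handling in `get_similarity_score`:
the judge is expected to return JSON that validates against the `SimilarityScore`
model. A self-contained sketch with an invented payload:

```
# Standalone illustration only; mirrors the SimilarityScore model from validation.py.
from pydantic import BaseModel, Field

class SimilarityScore(BaseModel):
    score: int = Field(..., ge=1, le=10)

sample_reply = '{"score": 8}'  # hypothetical judge output
print(SimilarityScore.model_validate_json(sample_reply).score)  # -> 8
```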
