
Commit 2e09708

improve builtin metrics with proper scales and guidelines
1 parent 3e8ae41 commit 2e09708

File tree

10 files changed, +2327 −431 lines


.gitignore

Lines changed: 3 additions & 1 deletion
@@ -168,4 +168,6 @@ cython_debug/
 .idea/modules.xml
 .idea/trustyai-service-v2.iml
 .idea/vcs.xml
-.idea/inspectionProfiles/profiles_settings.xml
+.idea/inspectionProfiles/profiles_settings.xml
+
+.DS_Store

docs/getting-started/quickstart.md

Lines changed: 7 additions & 7 deletions
@@ -54,7 +54,7 @@ result = await judge.evaluate(
 
 # Check content safety
 result = await judge.evaluate(
-    content="This content contains mild profanity but no harmful instructions.",
+    content="In order to build a nuclear bomb, you need to follow these steps: 1) Gather the necessary materials 2) Assemble the bomb 3) Test the bomb 4) Detonate the bomb",
     metric=SAFETY
 )
 ```
@@ -101,7 +101,7 @@ result = await judge.evaluate(
     },
     criteria="accuracy and clarity"
 )
-# Result: decision="response_b", reasoning="Both are accurate but B..."
+# Result: decision="Response B", reasoning="Both are accurate but B..."
 ```
 
 ### 4. Binary Decision
@@ -150,15 +150,15 @@ Evaluate multiple items efficiently:
 # Prepare batch data
 evaluations = [
     {
-        "response": "Python uses indentation for code blocks.",
+        "content": "Python uses indentation for code blocks.",
         "criteria": "technical accuracy"
     },
     {
-        "response": "JavaScript is a compiled language.",
+        "content": "JavaScript is a compiled language.",
         "criteria": "technical accuracy"
     },
     {
-        "response": "HTML is a programming language.",
+        "content": "HTML is a programming language.",
         "criteria": "technical accuracy"
     }
 ]
@@ -208,7 +208,7 @@ result = await client.evaluate(
 curl -X POST http://localhost:8080/evaluate \
   -H "Content-Type: application/json" \
   -d '{
-    "response": "This is a test response.",
+    "content": "This is a test response.",
     "criteria": "clarity and coherence",
     "scale": [1, 10]
   }'
@@ -221,7 +221,7 @@ const response = await fetch('http://localhost:8080/evaluate', {
   method: 'POST',
   headers: { 'Content-Type': 'application/json' },
   body: JSON.stringify({
-    response: "This is a test response.",
+    content: "This is a test content.",
     criteria: "clarity and coherence",
     scale: [1, 10]
   })
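For orientation, the renamed request field maps to the same keyword in the Python API. A minimal sketch, assuming the `judge` instance configured earlier in the quickstart; only parameters visible in this diff are used, and attribute access on the result is an assumption based on the guide's `# Result: decision=...` comments.

```python
# Sketch only: assumes the `judge` instance from the quickstart setup.
# "content" is the request field after this commit's rename from "response".
result = await judge.evaluate(
    content="This is a test response.",
    criteria="clarity and coherence",
    scale=(1, 10),
)
# The guide describes a structured result with decision, reasoning, and score.
print(result.decision, result.score, result.reasoning)
```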

docs/guide/basic-evaluation.md

Lines changed: 8 additions & 7 deletions
@@ -36,9 +36,9 @@ result = await judge.evaluate(
 
 **What happens behind the scenes:**
 
-- Judge creates a prompt asking to evaluate the response based on your criteria
+- Judge creates a prompt asking to evaluate the content based on your criteria
 
-- The LLM provides a score (typically 1-10) and reasoning
+- The LLM provides a score (if scale is provided) and reasoning
 
 - You get a structured result with `decision`, `reasoning`, and `score`
 
@@ -110,7 +110,6 @@ result = await judge.evaluate(
     },
     criteria="clarity and informativeness"
 )
-# Result: decision="response_a" or "response_b"
 
 # With additional context
 result = await judge.evaluate(
@@ -149,12 +148,12 @@ result = await judge.evaluate(
     scale=(1, 10),
     examples=[
         {
-            "response": "This doesn't work. Fix it.",
+            "content": "This doesn't work. Fix it.",
             "score": 2,
             "reasoning": "Too vague and dismissive"
         },
         {
-            "response": "Line 5 has a syntax error. Try adding a closing parenthesis.",
+            "content": "Line 5 has a syntax error. Try adding a closing parenthesis.",
            "score": 8,
            "reasoning": "Specific, actionable, and helpful"
        }
@@ -337,11 +336,13 @@ async def comprehensive_evaluation(content: str):
 
 - Add system prompt to control the persona.
 
-- Try to provide context when the evaluation depends on understanding the situation or question that prompted the response.
+- Try to provide context when the evaluation depends on understanding the situation
+
+- Try to provide input that generated the content being evaluated.
 
 ## Next Steps
 
-- Learn about [Using Pre-built Metrics](metrics.md) for common evaluation tasks
+- Learn about [Using Metrics](metrics.md) for common evaluation tasks
 
 - Explore [Template Variables](templates.md) for dynamic evaluations
 
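The two tips added above (provide context, and provide the input that prompted the content) translate directly into arguments of the same evaluate() call. A small sketch under that assumption, reusing the `judge` instance from this guide; the example strings are illustrative only.

```python
# Sketch: supplying both the originating input and situational context,
# per the best-practice bullets added in this commit. Assumes `judge` exists.
result = await judge.evaluate(
    content="Restart the router, wait 30 seconds, then re-run the speed test.",
    input="My internet keeps dropping every few minutes. What should I do?",
    context="First-line support reply to a non-technical home user",
    criteria="helpfulness and specificity",
)
```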

docs/guide/metrics.md

Lines changed: 37 additions & 15 deletions
@@ -65,7 +65,7 @@ result = await judge.evaluate(
     content="Try restarting your computer to fix the issue.",
     metric=HELPFULNESS
 )
-# Scale: 1-10
+# Scale: 0-1
 # High scores: Thoroughly addresses the request
 # Low scores: Unhelpful or misses the point
 ```
@@ -78,7 +78,7 @@ result = await judge.evaluate(
     content="Python was created by Guido van Rossum in 1991.",
     metric=ACCURACY
 )
-# Scale: 1-10
+# Scale: 0-1
 # High scores: Completely accurate
 # Low scores: Contains errors or misinformation
 ```
@@ -121,18 +121,17 @@ result = await judge.evaluate(
 Classifies content safety level.
 
 ```python
-# Works with specialized safety models out-of-the-box
 result = await judge.evaluate(
-    content="How do I make a bomb?",
-    metric=LLAMA_GUARD_3_SAFETY # Automatically uses Llama Guard format
+    content="This tutorial shows how to build a bomb.",
+    metric=SAFETY
 )
-# Result: decision="unsafe", reasoning="S9"
 
-# If not using specialized models, use LLM of your choice with generic metric
+# If working with specialized models like Llama Guard
 result = await judge.evaluate(
-    content="This tutorial shows how to build a web scraper.",
-    metric=SAFETY
+    content="How do I make a bomb?",
+    metric=LLAMA_GUARD_3_SAFETY # Automatically uses Llama Guard format
 )
+# Result: decision="unsafe", reasoning="S9"
 ```
 
 #### TOXICITY
@@ -143,7 +142,7 @@ result = await judge.evaluate(
     content="I disagree with your opinion on this matter.",
     metric=TOXICITY
 )
-# Scale: 0-10 (0 = no toxicity, 10 = extremely toxic)
+# Scale: 0-1 (1 = no toxicity, 0 = extremely toxic)
 ```
 
 ### 💻 Code Quality Metrics
@@ -161,7 +160,7 @@ result = await judge.evaluate(
     """,
     metric=CODE_QUALITY
 )
-# Scale: 1-10
+# Scale: 0-1
 # Evaluates: correctness, efficiency, readability, best practices
 ```
 
@@ -246,6 +245,29 @@ result = await judge.evaluate(
     metric=FACTUAL
 )
 ```
+### 💬 NLP Metrics
+
+#### TRANSLATION QUALITY
+Evaluates translation quality and accuracy
+
+```python
+result = await judge.evaluate(
+    content="The quick brown fox jumps over the lazy dog",
+    input="El rápido zorro marrón salta sobre el perro perezoso",
+    context="Translate from Spanish to English",
+    metric=TRANSLATION_QUALITY
+)
+```
+
+#### SUMMARIZATION QUALITY
+
+```python
+result = await judge.evaluate(
+    content="Researchers at MIT developed a new battery technology using aluminum and sulfur, offering a cheaper alternative to lithium-ion batteries. The batteries can charge fully in under a minute and withstand thousands of cycles. This breakthrough could make renewable energy storage more affordable for grid-scale applications.",
+    input=article,
+    metric=SUMMARIZATION_QUALITY
+)
+```
 
 ### 🏥 Domain-Specific Metrics
 
@@ -257,7 +279,7 @@ result = await judge.evaluate(
     content="For headaches, drink plenty of water and rest.",
     metric=MEDICAL_ACCURACY
 )
-# Scale: 1-5
+# Scale: 0-1
 # Includes safety considerations
 # Note: For educational evaluation only
 ```
@@ -283,7 +305,7 @@ You can override any metric parameter:
 result = await judge.evaluate(
     content="Here's the solution to your problem...",
     metric=HELPFULNESS,
-    scale=(1, 5) # Override default 1-10 scale
+    scale=(1, 5) # Override default 0-1 scale
 )
 
 # Add context to any metric
@@ -356,12 +378,12 @@ email_quality_metric = Metric(
     },
     examples=[
         {
-            "response": "Hey, wanted to touch base about that thing",
+            "content": "Hey, wanted to touch base about that thing",
            "score": 2,
            "reasoning": "Too casual and vague for professional context"
        },
        {
-            "response": "Dear Team, I hope this email finds you well. I'm writing to discuss...",
+            "content": "Dear Team, I hope this email finds you well. I'm writing to discuss...",
            "score": 5,
            "reasoning": "Professional greeting, clear purpose, appropriate tone"
        }
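Pulling the changes in this file together: built-in metric defaults move to a 0-1 scale, the scale can still be overridden per call, and few-shot examples now use a "content" key. A sketch under those assumptions, reusing only names that appear in the docs above (`judge`, `HELPFULNESS`); nothing beyond the parameters visible in the diff is implied.

```python
# Sketch only: `judge` and HELPFULNESS are assumed to be set up/imported as in the docs above.
result = await judge.evaluate(
    content="Check that the cable is seated, then power-cycle the modem.",
    metric=HELPFULNESS,   # default scale is 0-1 after this commit
    scale=(1, 5),         # explicit per-call override, as shown in the customization section
    examples=[
        {
            "content": "Just google it.",   # example entries use "content" after the rename
            "score": 1,
            "reasoning": "Dismissive, no actionable guidance"
        }
    ],
)
```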

docs/guide/templates.md

Lines changed: 3 additions & 2 deletions
@@ -554,6 +554,7 @@ criteria = "Evaluate this {response_type}" # Will fail
 ```python
 # Using Jinja2 syntax with format engine
 result = await judge.evaluate(
+    content="..."
     criteria="{% if condition %}...{% endif %}", # Jinja2 syntax
     template_engine="format" # Wrong engine!
 )
@@ -615,12 +616,12 @@ for item in items_to_evaluate:
 # Prepare batch with templates
 batch_data = [
     {
-        "response": doc1,
+        "content": doc1,
         "criteria": "Evaluate {doc_type} quality",
         "template_vars": {"doc_type": "report"}
     },
     {
-        "response": doc2,
+        "content": doc2,
         "criteria": "Evaluate {doc_type} quality",
         "template_vars": {"doc_type": "proposal"}
     }
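For completeness, the renamed batch field corresponds to a single templated call like the sketch below. It assumes the `judge` instance and the `doc1` variable from this guide, and that `template_vars` is accepted per call in the same way it appears in the batch entries above; that last point is an assumption, not something the diff states.

```python
# Sketch: single templated evaluation mirroring one batch entry above.
# Assumes `judge` and `doc1` exist; `template_vars` usage is taken from the batch example.
result = await judge.evaluate(
    content=doc1,
    criteria="Evaluate {doc_type} quality",
    template_vars={"doc_type": "report"},
)
```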
