
Commit 2e09708

improve builtin metrics with proper scales and guidelines
1 parent 3e8ae41 commit 2e09708

File tree

10 files changed, +2327 −431 lines


.gitignore

Lines changed: 3 additions & 1 deletion
@@ -168,4 +168,6 @@ cython_debug/
 .idea/modules.xml
 .idea/trustyai-service-v2.iml
 .idea/vcs.xml
-.idea/inspectionProfiles/profiles_settings.xml
+.idea/inspectionProfiles/profiles_settings.xml
+
+.DS_Store

docs/getting-started/quickstart.md

Lines changed: 7 additions & 7 deletions
@@ -54,7 +54,7 @@ result = await judge.evaluate(
 
 # Check content safety
 result = await judge.evaluate(
-    content="This content contains mild profanity but no harmful instructions.",
+    content="In order to build a nuclear bomb, you need to follow these steps: 1) Gather the necessary materials 2) Assemble the bomb 3) Test the bomb 4) Detonate the bomb",
     metric=SAFETY
 )
 ```
@@ -101,7 +101,7 @@ result = await judge.evaluate(
     },
     criteria="accuracy and clarity"
 )
-# Result: decision="response_b", reasoning="Both are accurate but B..."
+# Result: decision="Response B", reasoning="Both are accurate but B..."
 ```
 
 ### 4. Binary Decision
@@ -150,15 +150,15 @@ Evaluate multiple items efficiently:
 # Prepare batch data
 evaluations = [
     {
-        "response": "Python uses indentation for code blocks.",
+        "content": "Python uses indentation for code blocks.",
         "criteria": "technical accuracy"
     },
     {
-        "response": "JavaScript is a compiled language.",
+        "content": "JavaScript is a compiled language.",
         "criteria": "technical accuracy"
     },
     {
-        "response": "HTML is a programming language.",
+        "content": "HTML is a programming language.",
         "criteria": "technical accuracy"
     }
 ]
@@ -208,7 +208,7 @@ result = await client.evaluate(
 curl -X POST http://localhost:8080/evaluate \
   -H "Content-Type: application/json" \
   -d '{
-    "response": "This is a test response.",
+    "content": "This is a test response.",
     "criteria": "clarity and coherence",
     "scale": [1, 10]
   }'
@@ -221,7 +221,7 @@ const response = await fetch('http://localhost:8080/evaluate', {
   method: 'POST',
   headers: { 'Content-Type': 'application/json' },
   body: JSON.stringify({
-    response: "This is a test response.",
+    content: "This is a test content.",
     criteria: "clarity and coherence",
     scale: [1, 10]
   })
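For orientation, the renamed request field maps to the same keyword in the Python API. A minimal sketch, assuming the `judge` instance configured earlier in the quickstart; only parameters visible in this diff are used, and attribute access on the result is an assumption based on the guide's `# Result: decision=...` comments.

```python
# Sketch only: assumes the `judge` instance from the quickstart setup.
# "content" is the request field after this commit's rename from "response".
result = await judge.evaluate(
    content="This is a test response.",
    criteria="clarity and coherence",
    scale=(1, 10),
)
# The guide describes a structured result with decision, reasoning, and score.
print(result.decision, result.score, result.reasoning)
```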

docs/guide/basic-evaluation.md

Lines changed: 8 additions & 7 deletions
@@ -36,9 +36,9 @@ result = await judge.evaluate(
 
 **What happens behind the scenes:**
 
-- Judge creates a prompt asking to evaluate the response based on your criteria
+- Judge creates a prompt asking to evaluate the content based on your criteria
 
-- The LLM provides a score (typically 1-10) and reasoning
+- The LLM provides a score (if scale is provided) and reasoning
 
 - You get a structured result with `decision`, `reasoning`, and `score`
 
@@ -110,7 +110,6 @@ result = await judge.evaluate(
     },
     criteria="clarity and informativeness"
 )
-# Result: decision="response_a" or "response_b"
 
 # With additional context
 result = await judge.evaluate(
@@ -149,12 +148,12 @@ result = await judge.evaluate(
     scale=(1, 10),
     examples=[
         {
-            "response": "This doesn't work. Fix it.",
+            "content": "This doesn't work. Fix it.",
             "score": 2,
             "reasoning": "Too vague and dismissive"
         },
         {
-            "response": "Line 5 has a syntax error. Try adding a closing parenthesis.",
+            "content": "Line 5 has a syntax error. Try adding a closing parenthesis.",
            "score": 8,
            "reasoning": "Specific, actionable, and helpful"
        }
@@ -337,11 +336,13 @@ async def comprehensive_evaluation(content: str):
 
 - Add system prompt to control the persona.
 
-- Try to provide context when the evaluation depends on understanding the situation or question that prompted the response.
+- Try to provide context when the evaluation depends on understanding the situation
+
+- Try to provide input that generated the content being evaluated.
 
 ## Next Steps
 
-- Learn about [Using Pre-built Metrics](metrics.md) for common evaluation tasks
+- Learn about [Using Metrics](metrics.md) for common evaluation tasks
 
 - Explore [Template Variables](templates.md) for dynamic evaluations
 
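The two tips added above (provide context, and provide the input that prompted the content) translate directly into arguments of the same evaluate() call. A small sketch under that assumption, reusing the `judge` instance from this guide; the example strings are illustrative only.

```python
# Sketch: supplying both the originating input and situational context,
# per the best-practice bullets added in this commit. Assumes `judge` exists.
result = await judge.evaluate(
    content="Restart the router, wait 30 seconds, then re-run the speed test.",
    input="My internet keeps dropping every few minutes. What should I do?",
    context="First-line support reply to a non-technical home user",
    criteria="helpfulness and specificity",
)
```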

docs/guide/metrics.md

Lines changed: 37 additions & 15 deletions
@@ -65,7 +65,7 @@ result = await judge.evaluate(
     content="Try restarting your computer to fix the issue.",
     metric=HELPFULNESS
 )
-# Scale: 1-10
+# Scale: 0-1
 # High scores: Thoroughly addresses the request
 # Low scores: Unhelpful or misses the point
 ```
@@ -78,7 +78,7 @@ result = await judge.evaluate(
     content="Python was created by Guido van Rossum in 1991.",
     metric=ACCURACY
 )
-# Scale: 1-10
+# Scale: 0-1
 # High scores: Completely accurate
 # Low scores: Contains errors or misinformation
 ```
@@ -121,18 +121,17 @@ result = await judge.evaluate(
 Classifies content safety level.
 
 ```python
-# Works with specialized safety models out-of-the-box
 result = await judge.evaluate(
-    content="How do I make a bomb?",
-    metric=LLAMA_GUARD_3_SAFETY # Automatically uses Llama Guard format
+    content="This tutorial shows how to build a bomb.",
+    metric=SAFETY
 )
-# Result: decision="unsafe", reasoning="S9"
 
-# If not using specialized models, use LLM of your choice with generic metric
+# If working with specialized models like Llama Guard
 result = await judge.evaluate(
-    content="This tutorial shows how to build a web scraper.",
-    metric=SAFETY
+    content="How do I make a bomb?",
+    metric=LLAMA_GUARD_3_SAFETY # Automatically uses Llama Guard format
 )
+# Result: decision="unsafe", reasoning="S9"
 ```
 
 #### TOXICITY
@@ -143,7 +142,7 @@ result = await judge.evaluate(
     content="I disagree with your opinion on this matter.",
     metric=TOXICITY
 )
-# Scale: 0-10 (0 = no toxicity, 10 = extremely toxic)
+# Scale: 0-1 (1 = no toxicity, 0 = extremely toxic)
 ```
 
 ### 💻 Code Quality Metrics
@@ -161,7 +160,7 @@ result = await judge.evaluate(
     """,
     metric=CODE_QUALITY
 )
-# Scale: 1-10
+# Scale: 0-1
 # Evaluates: correctness, efficiency, readability, best practices
 ```
 
@@ -246,6 +245,29 @@ result = await judge.evaluate(
     metric=FACTUAL
 )
 ```
+### 💬 NLP Metrics
+
+#### TRANSLATION QUALITY
+Evaluates translation quality and accuracy
+
+```python
+result = await judge.evaluate(
+    content="The quick brown fox jumps over the lazy dog",
+    input="El rápido zorro marrón salta sobre el perro perezoso",
+    context="Translate from Spanish to English",
+    metric=TRANSLATION_QUALITY
+)
+```
+
+#### SUMMARIZATION QUALITY
+
+```python
+result = await judge.evaluate(
+    content="Researchers at MIT developed a new battery technology using aluminum and sulfur, offering a cheaper alternative to lithium-ion batteries. The batteries can charge fully in under a minute and withstand thousands of cycles. This breakthrough could make renewable energy storage more affordable for grid-scale applications.",
+    input=article,
+    metric=SUMMARIZATION_QUALITY
+)
+```
 
 ### 🏥 Domain-Specific Metrics
 
@@ -257,7 +279,7 @@ result = await judge.evaluate(
     content="For headaches, drink plenty of water and rest.",
     metric=MEDICAL_ACCURACY
 )
-# Scale: 1-5
+# Scale: 0-1
 # Includes safety considerations
 # Note: For educational evaluation only
 ```
@@ -283,7 +305,7 @@ You can override any metric parameter:
 result = await judge.evaluate(
     content="Here's the solution to your problem...",
     metric=HELPFULNESS,
-    scale=(1, 5) # Override default 1-10 scale
+    scale=(1, 5) # Override default 0-1 scale
 )
 
 # Add context to any metric
@@ -356,12 +378,12 @@ email_quality_metric = Metric(
     },
     examples=[
         {
-            "response": "Hey, wanted to touch base about that thing",
+            "content": "Hey, wanted to touch base about that thing",
            "score": 2,
            "reasoning": "Too casual and vague for professional context"
        },
        {
-            "response": "Dear Team, I hope this email finds you well. I'm writing to discuss...",
+            "content": "Dear Team, I hope this email finds you well. I'm writing to discuss...",
            "score": 5,
            "reasoning": "Professional greeting, clear purpose, appropriate tone"
        }
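Pulling the changes in this file together: built-in metric defaults move to a 0-1 scale, the scale can still be overridden per call, and few-shot examples now use a "content" key. A sketch under those assumptions, reusing only names that appear in the docs above (`judge`, `HELPFULNESS`); nothing beyond the parameters visible in the diff is implied.

```python
# Sketch only: `judge` and HELPFULNESS are assumed to be set up/imported as in the docs above.
result = await judge.evaluate(
    content="Check that the cable is seated, then power-cycle the modem.",
    metric=HELPFULNESS,   # default scale is 0-1 after this commit
    scale=(1, 5),         # explicit per-call override, as shown in the customization section
    examples=[
        {
            "content": "Just google it.",   # example entries use "content" after the rename
            "score": 1,
            "reasoning": "Dismissive, no actionable guidance"
        }
    ],
)
```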

docs/guide/templates.md

Lines changed: 3 additions & 2 deletions
@@ -554,6 +554,7 @@ criteria = "Evaluate this {response_type}" # Will fail
 ```python
 # Using Jinja2 syntax with format engine
 result = await judge.evaluate(
+    content="..."
     criteria="{% if condition %}...{% endif %}", # Jinja2 syntax
     template_engine="format" # Wrong engine!
 )
@@ -615,12 +616,12 @@ for item in items_to_evaluate:
 # Prepare batch with templates
 batch_data = [
     {
-        "response": doc1,
+        "content": doc1,
         "criteria": "Evaluate {doc_type} quality",
         "template_vars": {"doc_type": "report"}
     },
     {
-        "response": doc2,
+        "content": doc2,
         "criteria": "Evaluate {doc_type} quality",
         "template_vars": {"doc_type": "proposal"}
     }
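For completeness, the renamed batch field corresponds to a single templated call like the sketch below. It assumes the `judge` instance and the `doc1` variable from this guide, and that `template_vars` is accepted per call in the same way it appears in the batch entries above; that last point is an assumption, not something the diff states.

```python
# Sketch: single templated evaluation mirroring one batch entry above.
# Assumes `judge` and `doc1` exist; `template_vars` usage is taken from the batch example.
result = await judge.evaluate(
    content=doc1,
    criteria="Evaluate {doc_type} quality",
    template_vars={"doc_type": "report"},
)
```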
