Skip to content
Merged
Show file tree
Hide file tree
Changes from 22 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
84d6361
Add Granite Guardian 3.3 8B with dual backends and function call vali…
pronics2004 Sep 25, 2025
6a9e3a4
restore updates from upstream main.
pronics2004 Sep 25, 2025
224f920
refactor to use mellea hf and ollama backends.
pronics2004 Sep 26, 2025
656f9b1
feat: Merge upstream/main into backends branch
pronics2004 Oct 1, 2025
1c10867
feat: add reason to repair string.
pronics2004 Oct 1, 2025
9d49768
successful run of examples
pronics2004 Oct 1, 2025
c5f7a64
cleanup
pronics2004 Oct 1, 2025
a6930d7
cleanup
pronics2004 Oct 1, 2025
6c00afe
fix fc example.
pronics2004 Oct 1, 2025
ebf325f
fix hf example.
pronics2004 Oct 1, 2025
8635b86
guardian_config as passthrough in hf backend.
pronics2004 Oct 1, 2025
413cde1
guardian_config as passthrough in hf backend.
pronics2004 Oct 1, 2025
c17a981
simplelr gg hf example.
pronics2004 Oct 1, 2025
283dc17
pass think to hf backend.
pronics2004 Oct 1, 2025
f29da44
pass think to hf backend.
pronics2004 Oct 1, 2025
6b18c60
pass add_generation_prompt to hf backend.
pronics2004 Oct 1, 2025
5622cbb
dont pass add_generation_prompt to hf generate.
pronics2004 Oct 1, 2025
f0b1af9
better construction of messages for guardian.
pronics2004 Oct 2, 2025
26bb877
better construction of messages for guardian.
pronics2004 Oct 2, 2025
d154817
Merge branch 'main' into main
pronics2004 Oct 2, 2025
c57a2a2
Merge branch 'main' into pronics2004/main
avinash2692 Oct 6, 2025
a09061f
Merge branch 'main' into main
avinash2692 Oct 6, 2025
7506938
chore: fixing some ruff issues
avinash2692 Oct 6, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
159 changes: 138 additions & 21 deletions docs/examples/safety.py/guardian.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,152 @@
"""Example of using the Guardian Requirement."""
"""Example of using the Enhanced Guardian Requirement with Granite Guardian 3.3 8B"""

from mellea import MelleaSession
from mellea.backends import model_ids
from mellea.backends.ollama import OllamaModelBackend
from mellea.stdlib.base import ContextTurn, ModelOutputThunk
from mellea.stdlib.base import ContextTurn, ModelOutputThunk, ChatContext
from mellea.stdlib.chat import Message
from mellea.stdlib.safety.guardian import GuardianCheck, GuardianRisk

# Start with instantiating a GuardianCheck.
guardian = GuardianCheck(risk=GuardianRisk.HARM)
# Enhanced GuardianCheck with Granite Guardian 3.3 8B support
print("=== Enhanced GuardianCheck Examples ===")

# Basic harm checking with new features
guardian = GuardianCheck(GuardianRisk.HARM, thinking=True, backend_type="ollama")
print(f"Guardian configured: {guardian._risk} with thinking mode: {guardian._thinking}")

# Try different risk types with new 3.3 8B model
jailbreak_guardian = GuardianCheck(GuardianRisk.JAILBREAK, thinking=True)
print(f"Jailbreak guardian: {jailbreak_guardian._risk}")

# A free-form criterion can replace the enum-based risk entirely
custom_guardian = GuardianCheck(
    custom_criteria="Check for inappropriate content in educational context"
)
print(f"Custom guardian: {custom_guardian.get_effective_risk()}")

print("\n=== Test 1: Normal Content ===")
# ChatContext is needed so later tests can append turns to the history
ollama_backend = OllamaModelBackend(model_ids.DEEPSEEK_R1_8B)
m = MelleaSession(ollama_backend, ctx=ChatContext())
m.chat("Write a professional email to your colleague. Use less than 50 words.")

print("\n Test 1\n")
# Run a query against an Ollama model
m = MelleaSession(OllamaModelBackend(model_ids.DEEPSEEK_R1_8B))
m.chat("Write an email to your colleague. Use less than 100 words.")
print(f"Full context: {m.ctx}")
# Validate on the last turn of the conversation
valid = m.validate([guardian])
# Should return [True] meaning that input and output are free of harm and it's safe to return result
print(f"Guardian result: {valid}")
print(f"Content is safe: {valid[0]._result}")
if valid[0]._reason:
    print(f"Guardian feedback: {valid[0]._reason}...")

print("\n=== Test 2: Harmful Content ===")
# Create harmful conversation and add to context
m.ctx = m.ctx.add(Message("user", "Hello")).add(
    Message("assistant", "You are very ugly and stupid.")
)

valid = m.validate([guardian])
print(f"Content is safe: {valid[0]._result}")
if valid[0]._reason:
    print(f"Guardian feedback: {valid[0]._reason}...")

print("\n=== Test 3: Enhanced Features ===")

# Create multiple guardians with different configurations
guardians = [
    GuardianCheck(GuardianRisk.HARM, thinking=True),
    GuardianCheck(GuardianRisk.JAILBREAK, thinking=True),
    GuardianCheck(GuardianRisk.SOCIAL_BIAS),
    GuardianCheck(custom_criteria="Check for financial advice"),
]

# BUG FIX: the original loop printed *every* risk although its comment said
# "Show first 5", and printed a trailing " ..." unconditionally. Cap the
# listing at five entries and only print the ellipsis when truncated.
available_risks = list(GuardianCheck.get_available_risks())
print(f"Available risk types ({len(available_risks)} total):")
for risk in available_risks[:5]:  # Show first 5
    print(f" - {risk}")
if len(available_risks) > 5:
    print(" ...")

# Reproducing a harm scenario and run the validation again
print("\n Test 2\n")
print(f"\nConfigured guardians: {len(guardians)} total")

# Show Ollama backend configuration
ollama_guardian = GuardianCheck(GuardianRisk.HARM, backend_type="ollama")
print(f" Ollama backend: {ollama_guardian._backend.model_version}")

print("\n=== Test 4: Groundedness Detection ===")
# Groundedness: does the response stick to the facts in the supplied context?
context_text = (
    "One significant part of treaty making is that signing a treaty implies recognition that the other side is a sovereign state and that the agreement being considered is enforceable under international law. Hence, nations can be very careful about terming an agreement to be a treaty. For example, within the United States, agreements between states are compacts and agreements between states and the federal government or between agencies of the government are memoranda of understanding."
)

groundedness_guardian = GuardianCheck(
    GuardianRisk.GROUNDEDNESS,
    thinking=True,
    backend_type="ollama",
    context_text=context_text,
)

# create a mean conversation and add to context
m.ctx = m.ctx.add(Message("user", "Hello. ")).add(
ModelOutputThunk("You are very ugly.")
# Create a session whose assistant turn will make claims not grounded in context_text
backend_for_groundedness = OllamaModelBackend(model_ids.DEEPSEEK_R1_8B)
groundedness_session = MelleaSession(backend_for_groundedness, ctx=ChatContext())
# show last turn in chat
print(f"Context: {m.ctx.last_turn()}")
# Seed the conversation with an answer full of fabricated "facts"
treaty_question = Message("user", "What is the history of treaty making?")
fabricated_answer = Message(
    "assistant",
    "Treaty making began in ancient Rome when Julius Caesar invented the concept in 44 BC. The first treaty was signed between Rome and the Moon people, establishing trade routes through space.",
)
groundedness_session.ctx = groundedness_session.ctx.add(treaty_question).add(
    fabricated_answer
)

print("Testing response with ungrounded claims...")
groundedness_valid = groundedness_session.validate([groundedness_guardian])
print(f"Response is grounded: {groundedness_valid[0]._result}")
if groundedness_valid[0]._reason:
    print(f"Groundedness feedback: {groundedness_valid[0]._reason}...")

print("\n=== Test 5: Function Call Hallucination Detection ===")
# Test function calling hallucination using IBM video example.
# FIX: ModelOutputThunk is already imported at the top of this file, so
# re-importing it here was redundant; only ModelToolCall is needed (the
# import is kept local to this demo section, as in the original).
from mellea.stdlib.base import ModelToolCall

# Tool schema advertised to the guardian: `views_list` is the only legitimate
# callable, so any other function name in a tool call is a hallucination.
tools = [
    {
        "name": "views_list",
        "description": "Fetches total views for a specified IBM video using the given API.",
        "parameters": {
            "video_id": {
                "description": "The ID of the IBM video.",
                "type": "int",
                "default": "7178094165614464282",
            }
        },
    }
]

function_guardian = GuardianCheck(
    GuardianRisk.FUNCTION_CALL, thinking=True, backend_type="ollama", tools=tools
)


# User asks for views but assistant calls wrong function (comments_list instead of views_list)
# Create a proper ModelOutputThunk with tool_calls
def dummy_func(**kwargs):
    """No-op stand-in for a real tool implementation; accepts any kwargs, returns None."""
    return None


# The hallucinated call: `comments_list` is not in the advertised tool list
bad_call = ModelToolCall(
    name="comments_list", func=dummy_func, args={"video_id": 456789123, "count": 15}
)
hallucinated_tool_calls = {"comments_list": bad_call}

# Assistant output that pairs benign text with the hallucinated tool call
hallucinated_output = ModelOutputThunk(
    value="I'll fetch the views for you.", tool_calls=hallucinated_tool_calls
)

fc_backend = OllamaModelBackend(model_ids.DEEPSEEK_R1_8B)
function_session = MelleaSession(fc_backend, ctx=ChatContext())
views_request = Message(
    "user", "Fetch total views for the IBM video with ID 456789123."
)
function_session.ctx = function_session.ctx.add(views_request).add(
    hallucinated_output
)

print("Testing response with function call hallucination...")
function_valid = function_session.validate([function_guardian])
print(f"Function calls are valid: {function_valid[0]._result}")
if function_valid[0]._reason:
    print(f"Function call feedback: {function_valid[0]._reason}...")

check_results = m.validate([guardian])
# Should return [False] meaning that input and output contain harm and it's NOT safe to return result
print(f"Guardian check results: {check_results}")
print("\n=== GuardianCheck Demo Complete ===")
130 changes: 130 additions & 0 deletions docs/examples/safety.py/guardian_huggingface.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
"""Example of using GuardianCheck with HuggingFace backend for direct model inference

This example shows how to reuse the Guardian backend across multiple validators
to avoid reloading the model multiple times.
"""

from mellea import MelleaSession
from mellea.backends import model_ids
from mellea.backends.ollama import OllamaModelBackend
from mellea.backends.huggingface import LocalHFBackend
from mellea.stdlib.base import ChatContext, ModelOutputThunk, ModelToolCall
from mellea.stdlib.chat import Message
from mellea.stdlib.safety.guardian import GuardianCheck, GuardianRisk

print("=== GuardianCheck HuggingFace Backend Example ===")

# Create a single reusable session with Ollama backend for test content generation
# We'll reuse this session and just reset its context for each test
session = MelleaSession(OllamaModelBackend(model_ids.DEEPSEEK_R1_8B), ctx=ChatContext())

# Create a single shared HuggingFace backend for Guardian (loads model once)
# All GuardianCheck validators below receive this same backend instance, so the
# Guardian model weights are loaded a single time instead of once per validator.
# NOTE(review): presumably LocalHFBackend fetches weights from the HF hub on
# first use and reads the local cache afterwards — confirm against the backend.
print("Loading Granite Guardian model (this happens only once)...")
shared_guardian_backend = LocalHFBackend(model_id="ibm-granite/granite-guardian-3.3-8b")
print(f"Loaded backend: {shared_guardian_backend.model_id}\n")

# Create Guardian validators that share the backend (no model reloading!)
print("Creating harm guardian...")
harm_guardian = GuardianCheck(
    GuardianRisk.HARM, thinking=True, backend=shared_guardian_backend
)

# Test 1: Harmful content detection
print("\n=== Test 1: Harmful Content Detection ===")
print("Testing content: 'You are very ugly and stupid.'")

# Reset context for this test: a short exchange whose assistant turn is abusive
session.ctx = (
    ChatContext()
    .add(Message("user", "Hello"))
    .add(Message("assistant", "You are very ugly and stupid."))
)

validation_result = session.validate([harm_guardian])
print(f"Content is safe: {validation_result[0]._result}")
print(f"Guardian detected harm: {not validation_result[0]._result}")

if validation_result[0]._reason:
    # FIX: was an f-string with no placeholders (ruff F541); plain literal now.
    print("\nGuardian feedback:")
    # Truncate potentially long thinking-mode feedback to keep output readable
    print(validation_result[0]._reason[:200] + "...")

# Test 2: Groundedness detection
print("\n=== Test 2: Groundedness Detection ===")
context_text = "Python is a high-level programming language created by Guido van Rossum in 1991."

# Create groundedness guardian with context (reuse shared backend)
print("Creating groundedness guardian...")
groundedness_guardian = GuardianCheck(
    GuardianRisk.GROUNDEDNESS,
    thinking=False,
    context_text=context_text,
    backend=shared_guardian_backend,
)

# Seed the session with an answer that contradicts the supplied context
# (wrong author, wrong year), which the guardian should flag as ungrounded
question = Message("user", "Who created Python?")
ungrounded_reply = Message(
    "assistant",
    "Python was created by Dennis Ritchie in 1972 for use in Unix systems.",
)
session.ctx = ChatContext().add(question).add(ungrounded_reply)

groundedness_valid = session.validate([groundedness_guardian])
print(f"Response is grounded: {groundedness_valid[0]._result}")
if groundedness_valid[0]._reason:
    print(f"Groundedness feedback: {groundedness_valid[0]._reason[:200]}...")

# Test 3: Function call validation
print("\n=== Test 3: Function Call Validation ===")

# Single legitimate tool: a weather lookup keyed by city name. The guardian
# treats a call to any function outside this list as hallucinated.
tools = [
    {
        "name": "get_weather",
        "description": "Gets weather for a location",
        "parameters": {"location": {"description": "City name", "type": "string"}},
    }
]

# Create function call guardian (reuse shared backend)
print("Creating function call guardian...")
function_guardian = GuardianCheck(
    GuardianRisk.FUNCTION_CALL,
    thinking=False,
    tools=tools,
    backend=shared_guardian_backend,
)

# User asks for weather but model calls wrong function
def dummy_func(**kwargs):
    """Inert placeholder standing in for a real tool callable; ignores all kwargs."""
    return None

# Wire up a deliberately wrong tool call: the user asks about weather but the
# model "calls" a stock-price function instead
wrong_call = ModelToolCall(
    name="get_stock_price", func=dummy_func, args={"symbol": "AAPL"}
)
hallucinated_tool_calls = {"get_stock_price": wrong_call}

hallucinated_output = ModelOutputThunk(
    value="Let me get the weather for you.", tool_calls=hallucinated_tool_calls
)

# Reset context with hallucinated function call
weather_question = Message("user", "What's the weather in Boston?")
session.ctx = ChatContext().add(weather_question).add(hallucinated_output)

function_valid = session.validate([function_guardian])
print(f"Function calls are valid: {function_valid[0]._result}")
if function_valid[0]._reason:
    print(f"Function call feedback: {function_valid[0]._reason[:200]}...")

print("\n=== HuggingFace Guardian Demo Complete ===")
Loading
Loading