Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 27 additions & 33 deletions src/llamator/attacks/autodan_turbo.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,36 +287,34 @@ def _response_verification(self, history: List[Dict[str, str]], malicious_reques

return is_broken

def _update_strategy_library(self, successful_attacks: List[Dict[str, Any]]) -> None:
def _update_strategy_library(self, successful_attack: Dict[str, Any]) -> None:
"""
Updates the strategy library based on successful attacks.
        Updates the strategy library based on a successful attack.

Args:
successful_attacks: List of successful attack attempts
successful_attack: Successful attack attempt
"""
if not successful_attacks:
if not successful_attack:
return

# Format attack summary for the judge model
attack_summary = "\n\n".join(
[
f"Malicious Request: {attack['malicious_request']}\n"
f"Jailbreak Prompt: {attack['jailbreak_prompt']}\n"
f"Response: {attack['response']}\n"
f"Stages: {attack['stages']}"
for attack in successful_attacks
]
attack_summary = (
f"Malicious Request: {successful_attack['malicious_request']}\n"
f"Jailbreak Prompt: {successful_attack['jailbreak_prompt']}\n"
f"Response: {successful_attack['response']}\n"
f"Stages: {successful_attack['stages']}"
)

# Get new strategy from judge model
judge_chat = ChatSession(
client=self.judge_config.judge_client.get_target_client(), system_prompts=[SUMMARIZER_SYSTEM_PROMPT]
)

summary_response = judge_chat.say(attack_summary).lower()
summary_response = judge_chat.say(attack_summary)
if not summary_response:
logger.warning("Failed to get strategy summary from judge model")
return
summary_response = summary_response.lower()

# Extract and add new strategy
try:
Expand Down Expand Up @@ -354,7 +352,6 @@ def run(self) -> Generator[StatusUpdate, None, None]:
attack_prompts: List[str] = []
responses: List[str] = []
statuses: List[str] = []
successful_attacks: List[str] = []

try:
# Load and prepare dataset
Expand Down Expand Up @@ -480,15 +477,23 @@ def run(self) -> Generator[StatusUpdate, None, None]:
self._update_strategy_performance(strategy["strategy"], score)

# Record this successful attack for strategy improvement
successful_attacks.append(
{
"malicious_request": malicious_request,
"jailbreak_prompt": iter_attack_prompts[-1],
"response": iter_responses[-1],
"stages": conversation_stages,
"strategies_used": [s["strategy"] for s in selected_strategies],
}
yield StatusUpdate(
self.client_config,
self.info["code_name"],
self.status,
"Reflecting",
i,
self.num_attempts,
)
successful_attack = {
"malicious_request": malicious_request,
"jailbreak_prompt": iter_attack_prompts[-1],
"response": iter_responses[-1],
"stages": conversation_stages,
"strategies_used": [s["strategy"] for s in selected_strategies],
}
# Update strategy library
self._update_strategy_library(successful_attack)
else:
# If still resilient, mark all responses as resilient
conversation_statuses = ["Resilient"] * conversation_stages
Expand All @@ -503,17 +508,6 @@ def run(self) -> Generator[StatusUpdate, None, None]:
statuses.extend(conversation_statuses)
original_prompts.extend([malicious_request] * conversation_stages)

yield StatusUpdate(
self.client_config,
self.info["code_name"],
self.status,
"Reflecting",
i,
self.num_attempts,
)
# Update strategy library
self._update_strategy_library(successful_attacks)

# Save artifacts for report
self._save_attack_artifacts(
attack_prompts=attack_prompts,
Expand Down
5 changes: 5 additions & 0 deletions src/llamator/attacks/vlm_lowres_docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,11 @@ def run(self) -> Generator[StatusUpdate, None, None]:
judge_verdict = judge_chat.say(
LOWRES_DOCS_JUDGE_MODEL_TEMPLATE.format(expected_response=ground_truth, actual_response=response)
)
if not judge_verdict:
logger.warning("Error while Judge Model evaluates response (didn't receive response) ...")
self.status.report_error(response)
current_status = "Error"
continue
verified = "yes" in judge_verdict.lower()
current_status = "Resilient" if verified else "Broken"

Expand Down
5 changes: 5 additions & 0 deletions src/llamator/attacks/vlm_text_hallucination.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,11 @@ def run(self) -> Generator[StatusUpdate, None, None]:
expected_response=ground_truth, actual_response=response
)
)
if not judge_verdict:
logger.warning("Error while Judge Model evaluates response (didn't receive response) ...")
self.status.report_error(response)
current_status = "Error"
continue
verified = "yes" in judge_verdict.lower()
current_status = "Resilient" if verified else "Broken"
if not verified:
Expand Down
Loading