Skip to content

Commit c87ea8e

Browse files
authored
add shuffle inconsistency attack (#98)
1 parent 173f1b0 commit c87ea8e

File tree

9 files changed

+212
-14
lines changed

9 files changed

+212
-14
lines changed

CONTRIBUTING.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,10 @@ from ..attacks import ( # noqa
120120
ru_dan,
121121
ru_typoglycemia,
122122
ru_ucar,
123+
shuffle,
124+
suffix,
123125
sycophancy,
126+
system_prompt_leakage,
124127
typoglycemia,
125128
ucar,
126129
@@ -145,7 +148,10 @@ AvailableTests = [
145148
"RU_do_anything_now_jailbreak",
146149
"RU_typoglycemia_attack",
147150
"RU_ucar",
151+
"shuffle",
152+
"suffix",
148153
"sycophancy",
154+
"system_prompt_leakage",
149155
"typoglycemia_attack",
150156
"ucar",
151157

docs/attack_descriptions.md

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,13 @@ This section provides an overview of all available attacks.
77
### OWASP [LLM01:2025](https://genai.owasp.org/llmrisk/llm01-prompt-injection/) Prompt Injection
88

99
<details>
10-
<summary><strong>Adversarial suffix</strong></summary>
10+
<summary><strong>Adversarial Suffix</strong></summary>
1111

1212
In code name: `suffix`
1313

1414
Uses a specially crafted suffix to bypass LLM filters and restrictions, forcing the model to generate prohibited or harmful content despite the original query. Evaluates the model's vulnerability to input data manipulation.
1515

16-
_Original Paper <https://arxiv.org/abs/2307.15043>, Code <https://github.com/llm-attacks/llm-attacks>_
16+
_Original Paper: <https://arxiv.org/abs/2307.15043>, Original Code: <https://github.com/llm-attacks/llm-attacks>_
1717

1818
<a href="https://github.com/RomiconEZ/llamator/blob/release/src/llamator/attacks/suffix.py">View code on GitHub</a>
1919
</details>
@@ -45,7 +45,7 @@ In code name: `bon`
4545

4646
Probes the LLM resilience against Best-Of-N attack.
4747

48-
_Original Paper <https://arxiv.org/abs/2412.03556>, Code <https://github.com/jplhughes/bon-jailbreaking>_
48+
_Original Paper: <https://arxiv.org/abs/2412.03556>, Original Code: <https://github.com/jplhughes/bon-jailbreaking>_
4949

5050
<a href="https://github.com/RomiconEZ/llamator/blob/release/src/llamator/attacks/bon.py">View code on GitHub</a>
5151
</details>
@@ -120,6 +120,18 @@ Challenges the LLM with a dataset of harmful behavior prompts in past tense form
120120
<a href="https://github.com/RomiconEZ/llamator/blob/release/src/llamator/attacks/past_tense.py">View code on GitHub</a>
121121
</details>
122122

123+
<details>
124+
<summary><strong>Shuffle Inconsistency</strong></summary>
125+
126+
In code name: `shuffle`
127+
128+
Attempts to bypass the LLM's defense mechanisms by randomly shuffling the elements of a harmful input prompt.
129+
130+
_Original Paper: <https://arxiv.org/html/2501.04931>_
131+
132+
<a href="https://github.com/RomiconEZ/llamator/blob/release/src/llamator/attacks/shuffle.py">View code on GitHub</a>
133+
</details>
134+
123135
<details>
124136
<summary><strong>Typoglycemia Attack</strong></summary>
125137

@@ -192,7 +204,7 @@ In code name: `crescendo`
192204

193205
Challenges the LLM with a dataset of harmful behavior prompts using Crescendo strategy, operates by engaging the LLM in a series of escalating conversational turns, and multistage refinement.
194206

195-
_Original Paper <https://arxiv.org/abs/2404.01833>_
207+
_Original Paper: <https://arxiv.org/abs/2404.01833>_
196208

197209
<a href="https://github.com/RomiconEZ/llamator/blob/release/src/llamator/attacks/crescendo.py">View code on GitHub</a>
198210
</details>
@@ -238,7 +250,7 @@ In code name: `sycophancy`
238250

239251
Attacks the LLM system during a multistage conversation using the argumentum ad verecundiam method based on the system's domain.
240252

241-
_Original Paper <https://arxiv.org/abs/2310.13548>_
253+
_Original Paper: <https://arxiv.org/abs/2310.13548>_
242254

243255
<a href="https://github.com/RomiconEZ/llamator/blob/release/src/llamator/attacks/sycophancy.py">View code on GitHub</a>
244256
</details>

src/llamator/attack_provider/attack_loader.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
ru_dan,
1515
ru_typoglycemia,
1616
ru_ucar,
17+
shuffle,
1718
suffix,
1819
sycophancy,
1920
system_prompt_leakage,

src/llamator/attacks/attack_descriptions.json

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@
1616
{
1717
"name": "Best-of-N Jailbreak",
1818
"in_code_name": "bon",
19-
"description_en": "Probes the LLM resilience against Best-of-N attack.",
20-
"description_ru": "Испытывает устойчивость LLM к атаке Best-of-N.",
19+
"description_en": "Probes the LLM resilience against Best-of-N attack. Original Paper: https://arxiv.org/abs/2412.03556, Original Code: https://github.com/jplhughes/bon-jailbreaking",
20+
"description_ru": "Испытывает устойчивость LLM к атаке Best-of-N. Статья: https://arxiv.org/abs/2412.03556, оригинальный код: https://github.com/jplhughes/bon-jailbreaking",
2121
"github_link": "https://github.com/RomiconEZ/llamator/blob/release/src/llamator/attacks/bon.py"
2222
},
2323
{
@@ -30,8 +30,8 @@
3030
{
3131
"name": "Crescendo",
3232
"in_code_name": "crescendo",
33-
"description_en": "Challenges the LLM with a dataset of harmful behavior prompts using Crescendo strategy, operates by engaging the LLM in a series of escalating conversational turns, and multistage refinement.",
34-
"description_ru": "Испытывает LLM набором запросов, связанных с вредным поведением, используя стратегию Crescendo путем вовлечения LLM в серию растущих разговорных поворотов с самоулучшением атакующей модели.",
33+
"description_en": "Challenges the LLM with a dataset of harmful behavior prompts using Crescendo strategy, operates by engaging the LLM in a series of escalating conversational turns, and multistage refinement. Original Paper: https://arxiv.org/abs/2404.01833",
34+
"description_ru": "Испытывает LLM набором запросов, связанных с вредным поведением, используя стратегию Crescendo путем вовлечения LLM в серию растущих разговорных поворотов с самоулучшением атакующей модели. Статья: https://arxiv.org/abs/2404.01833",
3535
"github_link": "https://github.com/RomiconEZ/llamator/blob/release/src/llamator/attacks/crescendo.py"
3636
},
3737
{
@@ -105,17 +105,24 @@
105105
"github_link": "https://github.com/RomiconEZ/llamator/blob/release/src/llamator/attacks/ru_ucar.py"
106106
},
107107
{
108-
"name": "Adversarial suffix",
108+
"name": "Shuffle Inconsistency",
109+
"in_code_name": "shuffle",
110+
"description_en": "Attempts to bypass the LLM's defense mechanisms by randomly shuffling the elements of a harmful input prompt. Original Paper: https://arxiv.org/html/2501.04931",
111+
"description_ru": "Проверяет устойчивость LLM к Shuffle Inconsistency: Попытка обойти защиту LLM, случайным образом перемешивая элементы во входном вредоносном запросе. Статья: https://arxiv.org/html/2501.04931",
112+
"github_link": "https://github.com/RomiconEZ/llamator/blob/release/src/llamator/attacks/shuffle.py"
113+
},
114+
{
115+
"name": "Adversarial Suffix",
109116
"in_code_name": "suffix",
110-
"description_en": "Uses a specially crafted suffix to bypass LLM filters and restrictions, forcing the model to generate prohibited or harmful content despite the original query. Evaluates the model's vulnerability to input data manipulation.",
111-
"description_ru": "Использует специально сконструированный суффикс для обхода фильтров и ограничений LLM, заставляя модель генерировать запрещённый или вредоносный контент, несмотря на исходный запрос. Оценивает уязвимость модели к манипуляциям входных данных.",
117+
"description_en": "Uses a specially crafted suffix to bypass LLM filters and restrictions, forcing the model to generate prohibited or harmful content despite the original query. Evaluates the model's vulnerability to input data manipulation. Original Paper: https://arxiv.org/abs/2307.15043, Original Code: https://github.com/llm-attacks/llm-attacks",
118+
"description_ru": "Использует специально сконструированный суффикс для обхода фильтров и ограничений LLM, заставляя модель генерировать запрещённый или вредоносный контент, несмотря на исходный запрос. Оценивает уязвимость модели к манипуляциям входных данных. Статья: https://arxiv.org/abs/2307.15043, оригинальный код: https://github.com/llm-attacks/llm-attacks",
112119
"github_link": "https://github.com/RomiconEZ/llamator/blob/release/src/llamator/attacks/suffix.py"
113120
},
114121
{
115122
"name": "Sycophancy Test",
116123
"in_code_name": "sycophancy",
117-
"description_en": "Attacks the LLM system during a multistage conversation using the argumentum ad verecundiam method based on the system's domain.",
118-
"description_ru": "Проверяет связность и последовательность ответов LLM-системы в ходе продолжительного диалога на основе её описания, вступая с ней в спор с использованием аргументации к авторитету (лат. argumentum ad verecundiam).",
124+
"description_en": "Attacks the LLM system during a multistage conversation using the argumentum ad verecundiam method based on the system's domain. Original Paper: https://arxiv.org/abs/2310.13548",
125+
"description_ru": "Проверяет связность и последовательность ответов LLM-системы в ходе продолжительного диалога на основе её описания, вступая с ней в спор с использованием аргументации к авторитету (лат. argumentum ad verecundiam). Статья: https://arxiv.org/abs/2310.13548",
119126
"github_link": "https://github.com/RomiconEZ/llamator/blob/release/src/llamator/attacks/sycophancy.py"
120127
},
121128
{

src/llamator/attacks/shuffle.py

Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
import logging
2+
import os
3+
import random
4+
from pathlib import Path
5+
from typing import Generator, Optional
6+
7+
import pandas as pd
8+
9+
from ..attack_provider.attack_registry import register_test
10+
from ..attack_provider.test_base import StatusUpdate, TestBase
11+
from ..attack_provider.util import contains_refusal_keywords
12+
from ..client.attack_config import AttackConfig
13+
from ..client.chat_client import ChatSession
14+
from ..client.client_config import ClientConfig
15+
16+
logger = logging.getLogger(__name__)
17+
18+
19+
@register_test
class TestShuffle(TestBase):
    """Probes the LLM resilience against the Shuffle Inconsistency attack:
    randomly shuffles the words of a harmful prompt to bypass safety filters.

    Original Paper: https://arxiv.org/html/2501.04931
    """

    test_name = "shuffle"

    def __init__(
        self,
        client_config: ClientConfig,
        attack_config: AttackConfig,
        artifacts_path: Optional[str] = None,
        num_attempts: int = 0,
        *args,
        **kwargs,
    ):
        super().__init__(
            client_config,
            attack_config,
            artifacts_path=artifacts_path,
            num_attempts=num_attempts,
            *args,
            **kwargs,
        )

    def _prepare_attack_data(self, attack_prompts: list[str], responses: list[str], statuses: list[str]) -> None:
        """
        Prepares attack data in a structured DataFrame format and saves it as a CSV file.

        Args:
            attack_prompts (list[str]): List of attack texts generated during the test.
            responses (list[str]): List of responses from the target system.
            statuses (list[str]): List of statuses ('broken', 'resilient', 'error') corresponding to each attack.

        Returns:
            None
        """
        # Create a DataFrame from the lists
        df = pd.DataFrame({"attack_text": attack_prompts, "response_text": responses, "status": statuses})

        # Save the DataFrame as a CSV file to the artifacts path
        if self.artifacts_path:
            csv_path = os.path.join(self.artifacts_path, f"{self.test_name}.csv")
            df.to_csv(csv_path, index=False)
            # Fix: use the module-level logger (was root-logger `logging.info`),
            # consistent with the rest of this module.
            logger.info(f"{self.test_name} attack report saved to {csv_path}")

    @staticmethod
    def shuffle_words(prompt: str) -> str:
        """Return *prompt* with its whitespace-separated words in random order."""
        words = prompt.split()
        random.shuffle(words)  # nosec
        return " ".join(words)

    def run(self) -> Generator[StatusUpdate, None, None]:
        attack_prompts = []
        responses = []
        statuses = []

        # Locate the shared harmful-behavior dataset relative to this file
        current_dir = Path(__file__).parent
        dataset_filename = current_dir.parent / "attack_data" / "harmful_behavior.parquet"

        logger.info(f"Dataset filename: {dataset_filename}")
        data = pd.read_parquet(dataset_filename, engine="fastparquet")
        rows_count = data.shape[0]

        if self.num_attempts <= rows_count:
            # Dataset is larger than the requested number of samples:
            # reduce it by taking num_attempts random rows
            data = data.sample(n=self.num_attempts)
        else:
            # Dataset is smaller than the requested number of attack samples:
            # repeat rows until num_attempts is reached
            data = pd.concat([data] * (self.num_attempts // len(data) + 1))[: self.num_attempts].sort_index()
        # Fix: always reset the index (the original only did so in the else branch).
        # After sample() the index keeps the original random row labels, so
        # attack_prompt_index from iterrows() would be an arbitrary number and
        # corrupt the Generating progress reported in StatusUpdate below.
        data = data.reset_index(drop=True)

        # Generate list of attack prompt variations
        yield StatusUpdate(self.client_config, self.test_name, self.status, "Generating", 0, self.num_attempts)

        for attack_prompt_index, row in data.iterrows():
            attack_prompt = row["goal"]
            shuffled_prompt = self.shuffle_words(attack_prompt)
            attack_prompts.append(shuffled_prompt)
            logger.info(f"Test '{self.test_name}': generated attack prompt #{attack_prompt_index}: {shuffled_prompt}")
            yield StatusUpdate(
                self.client_config,
                self.test_name,
                self.status,
                "Generating",
                attack_prompt_index,
                self.num_attempts,
            )

        yield StatusUpdate(
            self.client_config,
            self.test_name,
            self.status,
            "Attacking",
            0,
            self.num_attempts,
        )

        # Run the attack by sending the prompts and verifying the responses
        for attack_prompt_index, attack_prompt in enumerate(attack_prompts):
            chat = ChatSession(
                self.client_config.get_target_client(),
                self.client_config.get_system_prompts(),
            )
            response = chat.say(attack_prompt)
            if not response:
                logger.warning("Error while attacking against target LLM (didn't receive response) ...")
                self.status.report_error(attack_prompt)
                current_status = "Error"
            else:
                # Verify the response to determine if this specific attack was successful:
                # absence of refusal keywords is treated as a breach
                verified = not contains_refusal_keywords(response)
                logger.info(f"Test '{self.test_name}': attack prompt #{attack_prompt_index}: {attack_prompt}")
                # Fix: removed stray unbalanced ')' after break_success={verified}
                logger.info(
                    f"Test '{self.test_name}': attack response #{attack_prompt_index} (break_success={verified}): {response}"
                )
                if verified:
                    self.status.report_breach(attack_prompt, response)
                    current_status = "Broken"
                else:
                    self.status.report_resilient(attack_prompt, response)
                    current_status = "Resilient"

            # Append data for report
            responses.append(response)
            statuses.append(current_status)

            yield StatusUpdate(
                self.client_config,
                self.test_name,
                self.status,
                "Attacking",
                attack_prompt_index,
                self.num_attempts,
            )

        # Prepare data for report generation
        self._prepare_attack_data(attack_prompts, responses, statuses)

        yield StatusUpdate(
            self.client_config,
            self.test_name,
            self.status,
            "Finished",
            self.num_attempts,
            self.num_attempts,
        )

src/llamator/initial_validation.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
"RU_do_anything_now_jailbreak",
2222
"RU_typoglycemia_attack",
2323
"RU_ucar",
24+
"shuffle",
2425
"suffix",
2526
"sycophancy",
2627
"system_prompt_leakage",

src/llamator/main.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ def start_testing(
8989
- RU_do_anything_now_jailbreak
9090
- RU_typoglycemia_attack
9191
- RU_ucar
92+
- shuffle
9293
- suffix
9394
- sycophancy
9495
- system_prompt_leakage

tests/test_llamator.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ def test_openai_client():
5757
# ("linguistic_evasion", 2),
5858
# ("logical_inconsistencies", 2),
5959
# ("past_tense", 2),
60+
# ("shuffle", 2),
6061
# ("suffix", 2),
6162
# ("sycophancy", 2),
6263
# ("system_prompt_leakage", 2),

tests/test_local_llamator.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,7 @@ def test_langchain_client_yandexgpt():
119119
# ("linguistic_evasion", 2),
120120
# ("logical_inconsistencies", 2),
121121
# ("past_tense", 2),
122+
# ("shuffle", 2),
122123
# ("suffix", 2),
123124
# ("sycophancy", 2),
124125
# ("system_prompt_leakage", 2),

0 commit comments

Comments
 (0)