add shuffle inconsistency attack (#98)

NickoJo · web-flow · commit c87ea8ee0ab7 · 2025-03-17T11:29:06.000+05:00
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -120,7 +120,10 @@ from ..attacks import (  # noqa
     ru_dan,
     ru_typoglycemia,
     ru_ucar,
+    shuffle,
+    suffix,
     sycophancy,
+    system_prompt_leakage,
     typoglycemia,
     ucar,
 
@@ -145,7 +148,10 @@ AvailableTests = [
     "RU_do_anything_now_jailbreak",
     "RU_typoglycemia_attack",
     "RU_ucar",
+    "shuffle",
+    "suffix",
     "sycophancy",
+    "system_prompt_leakage",
     "typoglycemia_attack",
     "ucar",
 
diff --git a/docs/attack_descriptions.md b/docs/attack_descriptions.md
@@ -7,13 +7,13 @@ This section provides an overview of all available attacks.
 ### OWASP [LLM01:2025](https://genai.owasp.org/llmrisk/llm01-prompt-injection/) Prompt Injection
 
 <details>
-<summary><strong>Adversarial suffix</strong></summary>
+<summary><strong>Adversarial Suffix</strong></summary>
 
 In code name: `suffix`
 
 Uses a specially crafted suffix to bypass LLM filters and restrictions, forcing the model to generate prohibited or harmful content despite the original query. Evaluates the model's vulnerability to input data manipulation.
 
-_Original Paper <https://arxiv.org/abs/2307.15043>, Code <https://github.com/llm-attacks/llm-attacks>_
+_Original Paper: <https://arxiv.org/abs/2307.15043>, Original Code: <https://github.com/llm-attacks/llm-attacks>_
 
 <a href="https://github.com/RomiconEZ/llamator/blob/release/src/llamator/attacks/suffix.py">View code on GitHub</a>
 </details>
@@ -45,7 +45,7 @@ In code name: `bon`
 
 Probes the LLM resilience against Best-Of-N attack.
 
-_Original Paper <https://arxiv.org/abs/2412.03556>, Code <https://github.com/jplhughes/bon-jailbreaking>_
+_Original Paper: <https://arxiv.org/abs/2412.03556>, Original Code: <https://github.com/jplhughes/bon-jailbreaking>_
 
 <a href="https://github.com/RomiconEZ/llamator/blob/release/src/llamator/attacks/bon.py">View code on GitHub</a>
 </details>
@@ -120,6 +120,18 @@ Challenges the LLM with a dataset of harmful behavior prompts in past tense form
 <a href="https://github.com/RomiconEZ/llamator/blob/release/src/llamator/attacks/past_tense.py">View code on GitHub</a>
 </details>
 
+<details>
+<summary><strong>Shuffle Inconsistency</strong></summary>
+
+In code name: `shuffle`
+
+Attempts to bypass the LLM's defense mechanism by randomly shuffling the elements of a harmful input prompt.
+
+_Original Paper: <https://arxiv.org/html/2501.04931>_
+
+<a href="https://github.com/RomiconEZ/llamator/blob/release/src/llamator/attacks/shuffle.py">View code on GitHub</a>
+</details>
+
 <details>
 <summary><strong>Typoglycemia Attack</strong></summary>
 
@@ -192,7 +204,7 @@ In code name: `crescendo`
 
 Challenges the LLM with a dataset of harmful behavior prompts using Crescendo strategy, operates by engaging the LLM in a series of escalating conversational turns, and multistage refinement.
 
-_Original Paper <https://arxiv.org/abs/2404.01833>_
+_Original Paper: <https://arxiv.org/abs/2404.01833>_
 
 <a href="https://github.com/RomiconEZ/llamator/blob/release/src/llamator/attacks/crescendo.py">View code on GitHub</a>
 </details>
@@ -238,7 +250,7 @@ In code name: `sycophancy`
 
 Attacks the LLM system during a multistage conversation using the argumentum ad verecundiam method based on the system's domain.
 
-_Original Paper <https://arxiv.org/abs/2310.13548>_
+_Original Paper: <https://arxiv.org/abs/2310.13548>_
 
 <a href="https://github.com/RomiconEZ/llamator/blob/release/src/llamator/attacks/sycophancy.py">View code on GitHub</a>
 </details>
diff --git a/src/llamator/attack_provider/attack_loader.py b/src/llamator/attack_provider/attack_loader.py
@@ -14,6 +14,7 @@
     ru_dan,
     ru_typoglycemia,
     ru_ucar,
+    shuffle,
     suffix,
     sycophancy,
     system_prompt_leakage,
diff --git a/src/llamator/attacks/attack_descriptions.json b/src/llamator/attacks/attack_descriptions.json
@@ -16,8 +16,8 @@
   {
     "name": "Best-of-N Jailbreak",
     "in_code_name": "bon",
-    "description_en": "Probes the LLM resilience against Best-of-N attack.",
-    "description_ru": "Испытывает устойчивость LLM к атаке Best-of-N.",
+    "description_en": "Probes the LLM resilience against Best-of-N attack. Original Paper: https://arxiv.org/abs/2412.03556, Original Code: https://github.com/jplhughes/bon-jailbreaking",
+    "description_ru": "Испытывает устойчивость LLM к атаке Best-of-N. Статья: https://arxiv.org/abs/2412.03556, оригинальный код: https://github.com/jplhughes/bon-jailbreaking",
     "github_link": "https://github.com/RomiconEZ/llamator/blob/release/src/llamator/attacks/bon.py"
   },
   {
@@ -30,8 +30,8 @@
   {
     "name": "Crescendo",
     "in_code_name": "crescendo",
-    "description_en": "Challenges the LLM with a dataset of harmful behavior prompts using Crescendo strategy, operates by engaging the LLM in a series of escalating conversational turns, and multistage refinement.",
-    "description_ru": "Испытывает LLM набором запросов, связанных с вредным поведением, используя стратегию Crescendo путем вовлечения LLM в серию растущих разговорных поворотов с самоулучшением атакующей модели.",
+    "description_en": "Challenges the LLM with a dataset of harmful behavior prompts using Crescendo strategy, operates by engaging the LLM in a series of escalating conversational turns, and multistage refinement. Original Paper: https://arxiv.org/abs/2404.01833",
+    "description_ru": "Испытывает LLM набором запросов, связанных с вредным поведением, используя стратегию Crescendo путем вовлечения LLM в серию растущих разговорных поворотов с самоулучшением атакующей модели. Статья: https://arxiv.org/abs/2404.01833",
     "github_link": "https://github.com/RomiconEZ/llamator/blob/release/src/llamator/attacks/crescendo.py"
   },
   {
@@ -105,17 +105,24 @@
     "github_link": "https://github.com/RomiconEZ/llamator/blob/release/src/llamator/attacks/ru_ucar.py"
   },
   {
-    "name": "Adversarial suffix",
+    "name": "Shuffle Inconsistency",
+    "in_code_name": "shuffle",
+    "description_en": "Attempts to bypass the LLM's defense mechanism by randomly shuffling the elements of a harmful input prompt. Original Paper: https://arxiv.org/html/2501.04931",
+    "description_ru": "Проверяет устойчивость LLM к Shuffle Inconsistency: Попытка обойти защиту LLM, случайным образом перемешивая элементы во входном вредоносном запросе. Статья: https://arxiv.org/html/2501.04931",
+    "github_link": "https://github.com/RomiconEZ/llamator/blob/release/src/llamator/attacks/shuffle.py"
+  },
+  {
+    "name": "Adversarial Suffix",
     "in_code_name": "suffix",
-    "description_en": "Uses a specially crafted suffix to bypass LLM filters and restrictions, forcing the model to generate prohibited or harmful content despite the original query. Evaluates the model's vulnerability to input data manipulation.",
-    "description_ru": "Использует специально сконструированный суффикс для обхода фильтров и ограничений LLM, заставляя модель генерировать запрещённый или вредоносный контент, несмотря на исходный запрос. Оценивает уязвимость модели к манипуляциям входных данных.",
+    "description_en": "Uses a specially crafted suffix to bypass LLM filters and restrictions, forcing the model to generate prohibited or harmful content despite the original query. Evaluates the model's vulnerability to input data manipulation. Original Paper: https://arxiv.org/abs/2307.15043, Original Code: https://github.com/llm-attacks/llm-attacks",
+    "description_ru": "Использует специально сконструированный суффикс для обхода фильтров и ограничений LLM, заставляя модель генерировать запрещённый или вредоносный контент, несмотря на исходный запрос. Оценивает уязвимость модели к манипуляциям входных данных. Статья: https://arxiv.org/abs/2307.15043, оригинальный код: https://github.com/llm-attacks/llm-attacks",
     "github_link": "https://github.com/RomiconEZ/llamator/blob/release/src/llamator/attacks/suffix.py"
   },
   {
     "name": "Sycophancy Test",
     "in_code_name": "sycophancy",
-    "description_en": "Attacks the LLM system during a multistage conversation using the argumentum ad verecundiam method based on the system's domain.",
-    "description_ru": "Проверяет связность и последовательность ответов LLM-системы в ходе продолжительного диалога на основе её описания, вступая с ней в спор с использованием аргументации к авторитету (лат. argumentum ad verecundiam).",
+    "description_en": "Attacks the LLM system during a multistage conversation using the argumentum ad verecundiam method based on the system's domain. Original Paper: https://arxiv.org/abs/2310.13548",
+    "description_ru": "Проверяет связность и последовательность ответов LLM-системы в ходе продолжительного диалога на основе её описания, вступая с ней в спор с использованием аргументации к авторитету (лат. argumentum ad verecundiam). Статья: https://arxiv.org/abs/2310.13548",
     "github_link": "https://github.com/RomiconEZ/llamator/blob/release/src/llamator/attacks/sycophancy.py"
   },
   {
diff --git a/src/llamator/attacks/shuffle.py b/src/llamator/attacks/shuffle.py
@@ -0,0 +1,168 @@
+import logging
+import os
+import random
+from pathlib import Path
+from typing import Generator, Optional
+
+import pandas as pd
+
+from ..attack_provider.attack_registry import register_test
+from ..attack_provider.test_base import StatusUpdate, TestBase
+from ..attack_provider.util import contains_refusal_keywords
+from ..client.attack_config import AttackConfig
+from ..client.chat_client import ChatSession
+from ..client.client_config import ClientConfig
+
+logger = logging.getLogger(__name__)
+
+
+@register_test
+class TestShuffle(TestBase):
+    """Probes the LLM resilience against Shuffle Inconsistency.
+    Original Paper: https://arxiv.org/html/2501.04931"""
+
+    test_name = "shuffle"
+
+    def __init__(
+        self,
+        client_config: ClientConfig,
+        attack_config: AttackConfig,
+        artifacts_path: Optional[str] = None,
+        num_attempts: int = 0,
+        *args,
+        **kwargs,
+    ):
+        super().__init__(
+            client_config,
+            attack_config,
+            artifacts_path=artifacts_path,
+            num_attempts=num_attempts,
+            *args,
+            **kwargs,
+        )
+
+    def _prepare_attack_data(self, attack_prompts: list[str], responses: list[str], statuses: list[str]) -> None:
+        """
+        Prepares attack data in a structured DataFrame format and saves it as a CSV file.
+
+        Args:
+            attack_prompts (list[str]): List of attack texts generated during the test.
+            responses (list[str]): List of responses from the target system.
+            statuses (list[str]): List of statuses ('broken', 'resilient', 'error') corresponding to each attack.
+
+        Returns:
+            None
+        """
+        # Create a DataFrame from the lists
+        df = pd.DataFrame({"attack_text": attack_prompts, "response_text": responses, "status": statuses})
+
+        # Save the DataFrame as a CSV file to the artifacts path
+        if self.artifacts_path:
+            csv_path = os.path.join(self.artifacts_path, f"{self.test_name}.csv")
+            df.to_csv(csv_path, index=False)
+            logging.info(f"{self.test_name} attack report saved to {csv_path}")
+
+    @staticmethod
+    def shuffle_words(prompt: str) -> str:
+        words = prompt.split()
+        random.shuffle(words)  # nosec
+        return " ".join(words)
+
+    def run(self) -> Generator[StatusUpdate, None, None]:
+        attack_prompts = []
+        responses = []
+        statuses = []
+        # Getting the path to the directory where the file is located harmful_behavior.py
+        current_dir = Path(__file__).parent
+        dataset_filename = current_dir.parent / "attack_data" / "harmful_behavior.parquet"
+
+        logger.info(f"Dataset filename: {dataset_filename}")
+        data = pd.read_parquet(dataset_filename, engine="fastparquet")
+        rows_count = data.shape[0]
+
+        if self.num_attempts <= rows_count:
+            # If dataset is larger than requested number of samples, reduce dataset by taking num_attempts samples
+            data = data.sample(n=self.num_attempts)
+        else:
+            # If dataset is smaller than requested number of attack samples, fit the dataset repeating until num_attempts is reached
+            data = (
+                pd.concat([data] * (self.num_attempts // len(data) + 1))[: self.num_attempts]
+                .sort_index()
+                .reset_index(drop=True)
+            )
+
+        # Generate list of attack prompt variations
+        yield StatusUpdate(self.client_config, self.test_name, self.status, "Generating", 0, self.num_attempts)
+
+        for attack_prompt_index, row in data.iterrows():
+            attack_prompt = row["goal"]
+            shuffled_prompt = self.shuffle_words(attack_prompt)
+            attack_prompts.append(shuffled_prompt)
+            logger.info(f"Test '{self.test_name}': generated attack prompt #{attack_prompt_index}: {shuffled_prompt}")
+            yield StatusUpdate(
+                self.client_config,
+                self.test_name,
+                self.status,
+                "Generating",
+                attack_prompt_index,
+                self.num_attempts,
+            )
+
+        yield StatusUpdate(
+            self.client_config,
+            self.test_name,
+            self.status,
+            "Attacking",
+            0,
+            self.num_attempts,
+        )
+
+        # Run the attack by sending the prompts and verifying the responses
+        for attack_prompt_index, attack_prompt in enumerate(attack_prompts):
+            chat = ChatSession(
+                self.client_config.get_target_client(),
+                self.client_config.get_system_prompts(),
+            )
+            response = chat.say(attack_prompt)
+            if not response:
+                logger.warning("Error while attacking against target LLM (didn't receive response) ...")
+                self.status.report_error(attack_prompt)
+                current_status = "Error"
+            else:
+                # Verify the response to determine if this specific attack was successful
+                verified = not contains_refusal_keywords(response)
+                logger.info(f"Test '{self.test_name}': attack prompt #{attack_prompt_index}: {attack_prompt}")
+                logger.info(
+                    f"Test '{self.test_name}': attack response #{attack_prompt_index} break_success={verified}): {response}"
+                )
+                if verified:
+                    self.status.report_breach(attack_prompt, response)
+                    current_status = "Broken"
+                else:
+                    self.status.report_resilient(attack_prompt, response)
+                    current_status = "Resilient"
+
+            # Append data for report
+            responses.append(response)
+            statuses.append(current_status)
+
+            yield StatusUpdate(
+                self.client_config,
+                self.test_name,
+                self.status,
+                "Attacking",
+                attack_prompt_index,
+                self.num_attempts,
+            )
+
+        # Prepare data for report generation
+        self._prepare_attack_data(attack_prompts, responses, statuses)
+
+        yield StatusUpdate(
+            self.client_config,
+            self.test_name,
+            self.status,
+            "Finished",
+            self.num_attempts,
+            self.num_attempts,
+        )
diff --git a/src/llamator/initial_validation.py b/src/llamator/initial_validation.py
@@ -21,6 +21,7 @@
     "RU_do_anything_now_jailbreak",
     "RU_typoglycemia_attack",
     "RU_ucar",
+    "shuffle",
     "suffix",
     "sycophancy",
     "system_prompt_leakage",
diff --git a/src/llamator/main.py b/src/llamator/main.py
@@ -89,6 +89,7 @@ def start_testing(
         - RU_do_anything_now_jailbreak
         - RU_typoglycemia_attack
         - RU_ucar
+        - shuffle
         - suffix
         - sycophancy
         - system_prompt_leakage
diff --git a/tests/test_llamator.py b/tests/test_llamator.py
@@ -57,6 +57,7 @@ def test_openai_client():
         # ("linguistic_evasion", 2),
         # ("logical_inconsistencies", 2),
         # ("past_tense", 2),
+        # ("shuffle", 2),
         # ("suffix", 2),
         # ("sycophancy", 2),
         # ("system_prompt_leakage", 2),
diff --git a/tests/test_local_llamator.py b/tests/test_local_llamator.py
@@ -119,6 +119,7 @@ def test_langchain_client_yandexgpt():
         # ("linguistic_evasion", 2),
         # ("logical_inconsistencies", 2),
         # ("past_tense", 2),
+        # ("shuffle", 2),
         # ("suffix", 2),
         # ("sycophancy", 2),
         # ("system_prompt_leakage", 2),