1+ from _types import AttackType , Focus
12from openai import OpenAI
23
3- from _types import Focus
4-
# Alias so call sites can type-hint against a project-local name rather
# than the third-party class directly.
OpenAIClient = OpenAI
65
76SUMMARIZE_ABSTRACT_PROMPT = """\
2019Respond with ONLY ONE of the labels above. Do not include anything else in your response.
2120"""
2221
# Attack Type descriptions
#
# One prose description per AttackType member; these are interpolated into
# ASSIGN_ATTACK_TYPE_PROMPT so the model can pick the best-matching label.

EVASION_DESCRIPTION = """\
Model Evasion is an adversarial attack aimed at bypassing or evading a machine
learning model's defenses, usually to make it produce incorrect outputs or behave
in ways that favor the attacker. In this context, the adversary doesn't try to
"break" the model or extract data from it (like in model inversion) but instead
seeks to manipulate the model's behavior in a way that allows them to achieve a
desired outcome, such as bypassing detection systems or generating misleading predictions.
"""
32+
# Description shown to the model for the ModelExtraction label.
EXTRACTION_DESCRIPTION = """\
Model Extraction refers to an attack where an adversary tries to replicate or steal
the functionality of a machine learning model by querying it and using the outputs
to build a copy of the original model. This type of attack doesn't necessarily involve
extracting sensitive data used for training, as in model inversion, but instead focuses
on how the model behaves—its predictions and outputs—in order to create a surrogate or
shadow model that behaves similarly to the original.
"""
41+
# Description shown to the model for the ModelInversion label.
INVERSION_DESCRIPTION = """\
Model inversion refers to a set of techniques in machine learning where an attacker
tries to extract confidential information from a trained AI model by interacting with
it in specific ways, often through extensive querying. By doing so, the attacker may
be able to infer details about the data used to train the model. These details can
range from personal information to the reconstruction of private or sensitive datasets,
potentially revealing confidential information.
"""
50+
# Description shown to the model for the ModelPoisoning label.
POISONING_DESCRIPTION = """\
Model Poisoning is an attack on machine learning models where an adversary intentionally
manipulates data in the training set to impact how a model behaves. Unlike attacks like
model inversion or model extraction, which focus on extracting information from the model,
model poisoning targets the model during its training phase. By introducing misleading,
incorrect, or adversarial data, attackers can manipulate a model's behavior, often without
detection, leading to significant security, reliability, and ethical risks.
"""
59+
# Description shown to the model for the PromptInjection label.
PROMPT_INJECTION_DESCRIPTION = """\
Prompt injection is a critical vulnerability in Large Language Models (LLMs), where malicious
users manipulate model behavior by crafting inputs that override, bypass, or exploit how the
model follows instructions. This vulnerability has become more pronounced with the widespread
use of generative AI systems, enabling attackers to induce unintended responses that may lead
to data leakage, misinformation, or system disruptions.
"""
67+
68+
# Maps every AttackType member to the description injected into the
# classification prompt. Other is the catch-all and has no real description.
ATTACK_TYPE_DESCRIPTIONS: dict[AttackType, str] = {
    AttackType.ModelEvasion: EVASION_DESCRIPTION,
    AttackType.ModelExtraction: EXTRACTION_DESCRIPTION,
    AttackType.ModelInversion: INVERSION_DESCRIPTION,
    AttackType.ModelPoisoning: POISONING_DESCRIPTION,
    AttackType.PromptInjection: PROMPT_INJECTION_DESCRIPTION,
    AttackType.Other: "None of the above",
}
77+
# System prompt for attack-type classification; {types} is filled with one
# "- `label`: description" bullet per AttackType member.
ASSIGN_ATTACK_TYPE_PROMPT = """\
You will be provided with an abstract of a scientific paper. \
Assess the most applicable attack type label based on the \
research focus, produced materials, and key outcomes.

{types}

If you feel like none of the types apply, you can respond with "Other".

Respond with ONLY ONE of the labels above. Do not include anything else in your response.
"""
89+
# Expected attack-type labels:
# Model Evasion
# Model Extraction
# Model Inversion
# Model Poisoning
# Prompt Injection
def get_openai_client(token: str) -> OpenAIClient:
    """Return an OpenAI API client authenticated with *token* (an API key)."""
    return OpenAI(api_key=token)
2598
2699
27100def summarize_abstract_with_openai (client : OpenAIClient , abstract : str ) -> str :
28101 response = client .chat .completions .create (
29- model = "gpt-3.5-turbo " ,
102+ model = "gpt-4o-mini " ,
30103 messages = [
31104 {"role" : "system" , "content" : SUMMARIZE_ABSTRACT_PROMPT },
32105 {"role" : "user" , "content" : f"{ abstract } " },
@@ -35,7 +108,8 @@ def summarize_abstract_with_openai(client: OpenAIClient, abstract: str) -> str:
35108 max_tokens = 100 ,
36109 )
37110
38- return response .choices [0 ].message .content .strip () # type: ignore
111+ return response .choices [0 ].message .content .strip () # type: ignore
112+
39113
40114def get_focus_label_from_abstract (client : OpenAIClient , abstract : str ) -> Focus | None :
41115 system_prompt = ASSIGN_LABEL_PROMPT .format (
@@ -52,8 +126,32 @@ def get_focus_label_from_abstract(client: OpenAIClient, abstract: str) -> Focus
52126 max_tokens = 10 ,
53127 )
54128
55- content = response .choices [0 ].message .content .strip () # type: ignore
129+ content = response .choices [0 ].message .content .strip () # type: ignore
56130 if content not in [f .value for f in Focus ]:
57131 return None
58-
59- return Focus (content )
132+
133+ return Focus (content )
134+
def get_attack_type_from_abstract(client: OpenAIClient, abstract: str) -> AttackType | None:
    """Classify a paper abstract into an AttackType via an OpenAI chat model.

    Args:
        client: Initialized OpenAI client.
        abstract: The paper abstract to classify.

    Returns:
        The matching AttackType, or None when the model's reply is not one of
        the known labels.
    """
    # One "- `label`: description" bullet per attack type.
    system_prompt = ASSIGN_ATTACK_TYPE_PROMPT.format(
        types="\n".join(f"- `{t.value}`: {ATTACK_TYPE_DESCRIPTIONS[t]}" for t in AttackType)
    )

    # NOTE(review): summarize_abstract_with_openai was moved to "gpt-4o-mini"
    # while this call still uses "gpt-3.5-turbo" -- confirm the mismatch is
    # intentional before changing it.
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": abstract},
        ],
        temperature=0.5,
        max_tokens=10,
    )

    content = response.choices[0].message.content.strip()  # type: ignore
    # The prompt shows labels wrapped in backticks, so the model may echo them.
    content = content.strip("`")

    # Set membership is O(1) per check; avoids rebuilding a list for `in`.
    if content not in {t.value for t in AttackType}:
        print(f"Invalid attack type: {content}")
        return None

    return AttackType(content)