1+ from _types import AttackType , Focus
12from openai import OpenAI
23
3- from _types import Focus
4-
# Alias so call sites can type-hint against a project-local name rather
# than the third-party class directly.
OpenAIClient = OpenAI
65
76SUMMARIZE_ABSTRACT_PROMPT = """\
2019Respond with ONLY ONE of the labels above. Do not include anything else in your response.
2120"""
2221
# Attack Type descriptions
#
# One prose description per AttackType member; these are interpolated into
# ASSIGN_ATTACK_TYPE_PROMPT so the model can pick the best-matching label.

EVASION_DESCRIPTION = """\
Model Evasion is an adversarial attack aimed at bypassing or evading a machine
learning model's defenses, usually to make it produce incorrect outputs or behave
in ways that favor the attacker. In this context, the adversary doesn't try to
"break" the model or extract data from it (like in model inversion) but instead
seeks to manipulate the model's behavior in a way that allows them to achieve a
desired outcome, such as bypassing detection systems or generating misleading predictions.
"""
32+
# Description shown to the model for the ModelExtraction label.
EXTRACTION_DESCRIPTION = """\
Model Extraction refers to an attack where an adversary tries to replicate or steal
the functionality of a machine learning model by querying it and using the outputs
to build a copy of the original model. This type of attack doesn't necessarily involve
extracting sensitive data used for training, as in model inversion, but instead focuses
on how the model behaves—its predictions and outputs—in order to create a surrogate or
shadow model that behaves similarly to the original.
"""
41+
# Description shown to the model for the ModelInversion label.
INVERSION_DESCRIPTION = """\
Model inversion refers to a set of techniques in machine learning where an attacker
tries to extract confidential information from a trained AI model by interacting with
it in specific ways, often through extensive querying. By doing so, the attacker may
be able to infer details about the data used to train the model. These details can
range from personal information to the reconstruction of private or sensitive datasets,
potentially revealing confidential information.
"""
50+
# Description shown to the model for the ModelPoisoning label.
POISONING_DESCRIPTION = """\
Model Poisoning is an attack on machine learning models where an adversary intentionally
manipulates data in the training set to impact how a model behaves. Unlike attacks like
model inversion or model extraction, which focus on extracting information from the model,
model poisoning targets the model during its training phase. By introducing misleading,
incorrect, or adversarial data, attackers can manipulate a model's behavior, often without
detection, leading to significant security, reliability, and ethical risks.
"""
59+
# Description shown to the model for the PromptInjection label.
PROMPT_INJECTION_DESCRIPTION = """\
Prompt injection is a critical vulnerability in Large Language Models (LLMs), where malicious
users manipulate model behavior by crafting inputs that override, bypass, or exploit how the
model follows instructions. This vulnerability has become more pronounced with the widespread
use of generative AI systems, enabling attackers to induce unintended responses that may lead
to data leakage, misinformation, or system disruptions.
"""
67+
68+
# Maps every AttackType member to the description injected into the
# classification prompt. Other is the catch-all and has no real description.
ATTACK_TYPE_DESCRIPTIONS: dict[AttackType, str] = {
    AttackType.ModelEvasion: EVASION_DESCRIPTION,
    AttackType.ModelExtraction: EXTRACTION_DESCRIPTION,
    AttackType.ModelInversion: INVERSION_DESCRIPTION,
    AttackType.ModelPoisoning: POISONING_DESCRIPTION,
    AttackType.PromptInjection: PROMPT_INJECTION_DESCRIPTION,
    AttackType.Other: "None of the above",
}
77+
# System prompt for attack-type classification; {types} is filled with one
# "- `label`: description" bullet per AttackType member.
ASSIGN_ATTACK_TYPE_PROMPT = """\
You will be provided with an abstract of a scientific paper. \
Assess the most applicable attack type label based on the \
research focus, produced materials, and key outcomes.

{types}

If you feel like none of the types apply, you can respond with "Other".

Respond with ONLY ONE of the labels above. Do not include anything else in your response.
"""
89+
# Expected attack-type labels:
# Model Evasion
# Model Extraction
# Model Inversion
# Model Poisoning
# Prompt Injection
def get_openai_client(token: str) -> OpenAIClient:
    """Return an OpenAI API client authenticated with *token* (an API key)."""
    return OpenAI(api_key=token)
2598
2699
27100def summarize_abstract_with_openai (client : OpenAIClient , abstract : str ) -> str :
28101 response = client .chat .completions .create (
29- model = "gpt-3.5-turbo " ,
102+ model = "gpt-4o-mini " ,
30103 messages = [
31104 {"role" : "system" , "content" : SUMMARIZE_ABSTRACT_PROMPT },
32105 {"role" : "user" , "content" : f"{ abstract } " },
@@ -35,7 +108,8 @@ def summarize_abstract_with_openai(client: OpenAIClient, abstract: str) -> str:
35108 max_tokens = 100 ,
36109 )
37110
38- return response .choices [0 ].message .content .strip () # type: ignore
111+ return response .choices [0 ].message .content .strip () # type: ignore
112+
39113
40114def get_focus_label_from_abstract (client : OpenAIClient , abstract : str ) -> Focus | None :
41115 system_prompt = ASSIGN_LABEL_PROMPT .format (
@@ -52,8 +126,32 @@ def get_focus_label_from_abstract(client: OpenAIClient, abstract: str) -> Focus
52126 max_tokens = 10 ,
53127 )
54128
55- content = response .choices [0 ].message .content .strip () # type: ignore
129+ content = response .choices [0 ].message .content .strip () # type: ignore
56130 if content not in [f .value for f in Focus ]:
57131 return None
58-
59- return Focus (content )
132+
133+ return Focus (content )
134+
def get_attack_type_from_abstract(client: OpenAIClient, abstract: str) -> AttackType | None:
    """Classify a paper abstract into an AttackType via an OpenAI chat model.

    Args:
        client: Initialized OpenAI client.
        abstract: The paper abstract to classify.

    Returns:
        The matching AttackType, or None when the model's reply is not one of
        the known labels.
    """
    # One "- `label`: description" bullet per attack type.
    system_prompt = ASSIGN_ATTACK_TYPE_PROMPT.format(
        types="\n".join(f"- `{t.value}`: {ATTACK_TYPE_DESCRIPTIONS[t]}" for t in AttackType)
    )

    # NOTE(review): summarize_abstract_with_openai was moved to "gpt-4o-mini"
    # while this call still uses "gpt-3.5-turbo" -- confirm the mismatch is
    # intentional before changing it.
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": abstract},
        ],
        temperature=0.5,
        max_tokens=10,
    )

    content = response.choices[0].message.content.strip()  # type: ignore
    # The prompt shows labels wrapped in backticks, so the model may echo them.
    content = content.strip("`")

    # Set membership is O(1) per check; avoids rebuilding a list for `in`.
    if content not in {t.value for t in AttackType}:
        print(f"Invalid attack type: {content}")
        return None

    return AttackType(content)