@@ -31,7 +31,45 @@ def load_model_and_tokenizer(model_name='meta-llama/Prompt-Guard-86M'):
31
31
return model , tokenizer
32
32
33
33
34
- def get_class_probabilities (model , tokenizer , text , temperature = 1.0 , device = 'cpu' ):
34
+ def preprocess_text_for_promptguard (text : str , tokenizer ) -> str :
35
+ """
36
+ Preprocess the text by removing spaces that break apart larger tokens.
37
+ This hotfixes a workaround to PromptGuard, where spaces can be inserted into a string
38
+ to allow the string to be classified as benign.
39
+
40
+ Args:
41
+ text (str): The input text to preprocess.
42
+ tokenizer (transformers.PreTrainedTokenizer): The tokenizer for the model.
43
+
44
+ Returns:
45
+ str: The preprocessed text.
46
+ """
47
+
48
+ try :
49
+ cleaned_text = ''
50
+ index_map = []
51
+ for i , char in enumerate (text ):
52
+ if not char .isspace ():
53
+ cleaned_text += char
54
+ index_map .append (i )
55
+ tokens = tokenizer .tokenize (cleaned_text )
56
+ result = []
57
+ last_end = 0
58
+ for token in tokens :
59
+ token_str = tokenizer .convert_tokens_to_string ([token ])
60
+ start = cleaned_text .index (token_str , last_end )
61
+ end = start + len (token_str )
62
+ original_start = index_map [start ]
63
+ if original_start > 0 and text [original_start - 1 ].isspace ():
64
+ result .append (' ' )
65
+ result .append (token_str )
66
+ last_end = end
67
+ return '' .join (result )
68
+ except Exception :
69
+ return text
70
+
71
+
72
+ def get_class_probabilities (model , tokenizer , text , temperature = 1.0 , device = 'cpu' , preprocess = True ):
35
73
"""
36
74
Evaluate the model on the given text with temperature-adjusted softmax.
37
75
Note, as this is a DeBERTa model, the input text should have a maximum length of 512.
@@ -44,6 +82,8 @@ def get_class_probabilities(model, tokenizer, text, temperature=1.0, device='cpu
44
82
Returns:
45
83
torch.Tensor: The probability of each class adjusted by the temperature.
46
84
"""
85
+ if preprocess :
86
+ text = preprocess_text_for_promptguard (text , tokenizer )
47
87
# Encode the text
48
88
inputs = tokenizer (text , return_tensors = "pt" , padding = True , truncation = True , max_length = 512 )
49
89
inputs = inputs .to (device )
@@ -57,7 +97,7 @@ def get_class_probabilities(model, tokenizer, text, temperature=1.0, device='cpu
57
97
return probabilities
58
98
59
99
60
- def get_jailbreak_score (model , tokenizer , text , temperature = 1.0 , device = 'cpu' ):
100
+ def get_jailbreak_score (model , tokenizer , text , temperature = 1.0 , device = 'cpu' , preprocess = True ):
61
101
"""
62
102
Evaluate the probability that a given string contains malicious jailbreak or prompt injection.
63
103
Appropriate for filtering dialogue between a user and an LLM.
@@ -70,11 +110,11 @@ def get_jailbreak_score(model, tokenizer, text, temperature=1.0, device='cpu'):
70
110
Returns:
71
111
float: The probability of the text containing malicious content.
72
112
"""
73
- probabilities = get_class_probabilities (model , tokenizer , text , temperature , device )
113
+ probabilities = get_class_probabilities (model , tokenizer , text , temperature , device , preprocess )
74
114
return probabilities [0 , 2 ].item ()
75
115
76
116
77
- def get_indirect_injection_score (model , tokenizer , text , temperature = 1.0 , device = 'cpu' ):
117
+ def get_indirect_injection_score (model , tokenizer , text , temperature = 1.0 , device = 'cpu' , preprocess = True ):
78
118
"""
79
119
Evaluate the probability that a given string contains any embedded instructions (malicious or benign).
80
120
Appropriate for filtering third party inputs (e.g. web searches, tool outputs) into an LLM.
@@ -87,11 +127,11 @@ def get_indirect_injection_score(model, tokenizer, text, temperature=1.0, device
87
127
Returns:
88
128
float: The combined probability of the text containing malicious or embedded instructions.
89
129
"""
90
- probabilities = get_class_probabilities (model , tokenizer , text , temperature , device )
130
+ probabilities = get_class_probabilities (model , tokenizer , text , temperature , device , preprocess )
91
131
return (probabilities [0 , 1 ] + probabilities [0 , 2 ]).item ()
92
132
93
133
94
- def process_text_batch (model , tokenizer , texts , temperature = 1.0 , device = 'cpu' ):
134
+ def process_text_batch (model , tokenizer , texts , temperature = 1.0 , device = 'cpu' , preprocess = True ):
95
135
"""
96
136
Process a batch of texts and return their class probabilities.
97
137
Args:
@@ -104,6 +144,8 @@ def process_text_batch(model, tokenizer, texts, temperature=1.0, device='cpu'):
104
144
Returns:
105
145
torch.Tensor: A tensor containing the class probabilities for each text in the batch.
106
146
"""
147
+ if preprocess :
148
+ texts = [preprocess_text_for_promptguard (text , tokenizer ) for text in texts ]
107
149
inputs = tokenizer (texts , return_tensors = "pt" , padding = True , truncation = True , max_length = 512 )
108
150
inputs = inputs .to (device )
109
151
with torch .no_grad ():
@@ -113,7 +155,7 @@ def process_text_batch(model, tokenizer, texts, temperature=1.0, device='cpu'):
113
155
return probabilities
114
156
115
157
116
- def get_scores_for_texts (model , tokenizer , texts , score_indices , temperature = 1.0 , device = 'cpu' , max_batch_size = 16 ):
158
+ def get_scores_for_texts (model , tokenizer , texts , score_indices , temperature = 1.0 , device = 'cpu' , max_batch_size = 16 , preprocess = True ):
117
159
"""
118
160
Compute scores for a list of texts, handling texts of arbitrary length by breaking them into chunks and processing in parallel.
119
161
Args:
@@ -138,15 +180,15 @@ def get_scores_for_texts(model, tokenizer, texts, score_indices, temperature=1.0
138
180
for i in range (0 , len (all_chunks ), max_batch_size ):
139
181
batch_chunks = all_chunks [i :i + max_batch_size ]
140
182
batch_indices = text_indices [i :i + max_batch_size ]
141
- probabilities = process_text_batch (model , tokenizer , batch_chunks , temperature , device )
183
+ probabilities = process_text_batch (model , tokenizer , batch_chunks , temperature , device , preprocess )
142
184
scores = probabilities [:, score_indices ].sum (dim = 1 ).tolist ()
143
185
144
186
for idx , score in zip (batch_indices , scores ):
145
187
all_scores [idx ] = max (all_scores [idx ], score )
146
188
return all_scores
147
189
148
190
149
- def get_jailbreak_scores_for_texts (model , tokenizer , texts , temperature = 1.0 , device = 'cpu' , max_batch_size = 16 ):
191
+ def get_jailbreak_scores_for_texts (model , tokenizer , texts , temperature = 1.0 , device = 'cpu' , max_batch_size = 16 , preprocess = True ):
150
192
"""
151
193
Compute jailbreak scores for a list of texts.
152
194
Args:
@@ -160,10 +202,10 @@ def get_jailbreak_scores_for_texts(model, tokenizer, texts, temperature=1.0, dev
160
202
Returns:
161
203
list[float]: A list of jailbreak scores for each text.
162
204
"""
163
- return get_scores_for_texts (model , tokenizer , texts , [2 ], temperature , device , max_batch_size )
205
+ return get_scores_for_texts (model , tokenizer , texts , [2 ], temperature , device , max_batch_size , preprocess )
164
206
165
207
166
- def get_indirect_injection_scores_for_texts (model , tokenizer , texts , temperature = 1.0 , device = 'cpu' , max_batch_size = 16 ):
208
+ def get_indirect_injection_scores_for_texts (model , tokenizer , texts , temperature = 1.0 , device = 'cpu' , max_batch_size = 16 , preprocess = True ):
167
209
"""
168
210
Compute indirect injection scores for a list of texts.
169
211
Args:
@@ -177,4 +219,4 @@ def get_indirect_injection_scores_for_texts(model, tokenizer, texts, temperature
177
219
Returns:
178
220
list[float]: A list of indirect injection scores for each text.
179
221
"""
180
- return get_scores_for_texts (model , tokenizer , texts , [1 , 2 ], temperature , device , max_batch_size )
222
+ return get_scores_for_texts (model , tokenizer , texts , [1 , 2 ], temperature , device , max_batch_size , preprocess )
0 commit comments