@@ -31,6 +31,8 @@ def create_category(name):
3131 return CategoryMath ()
3232 elif name == "creative_writing_v0.1" :
3333 return CategoryCreativeWriting ()
34+ elif name == "refusal_v0.1" :
35+ return CategoryRefusalAPI ()
3436 elif name == "refusal_v0.2" :
3537 return CategoryRefusalHF ()
3638
@@ -45,15 +47,20 @@ def __init__(self):
4547 def get_answer (self , batch , model_name , max_tokens , temperature , api_dict ):
4648 assert len (batch ) == 1 , "API-based categories must have batch size of 1"
4749
48- conv , uid = self .pre_process (batch )
49- output = chat_completion_openai (
50- model = model_name ,
51- messages = conv ,
52- temperature = temperature ,
53- max_tokens = max_tokens ,
54- api_dict = api_dict ,
55- )
56- return self .post_process (output , uid )
50+ convs , uids = self .pre_process (batch )
51+
52+ outputs = []
53+ for conv in convs :
54+ output = chat_completion_openai (
55+ model = model_name ,
56+ messages = conv ,
57+ temperature = temperature ,
58+ max_tokens = max_tokens ,
59+ api_dict = api_dict ,
60+ )
61+ outputs .append (output )
62+
63+ return self .post_process (outputs , uids )
5764
5865 def pre_process (self , row ):
5966 """
@@ -65,24 +72,25 @@ def pre_process(self, row):
6572 row (pd.Dataframe): row representing single battle to be labeled
6673
6774 Returns:
68- conv (dict ): processed text with system prompt in OpenAI API format:
75+ to_label (List[List[Dict]] ): list of queries, each including system prompt in OpenAI API format:
6976 [
7077 {"role": "system", "content": <system prompt>"},
7178 {"role": "user", "content": <user input>},
7279 ...
7380 ]
81+ uid (str): UID to be labeled
7482 """
7583 pass
7684
77- def post_process (self , judgement , uid ):
85+ def post_process (self , judgements , uid ):
7886 """
7987 Processes judgements/outputs of LLM to retrieve final labels
8088
8189 Inherited category classifier classes should implement this method.
8290
8391 Args:
84- judgement ( str): text output of LLM labeler
85- uid (str): UID of the battle to be labeled
92+ judgements (List[ str] ): text outputs of LLM labeler
93+ uid (str): UID of the battles to be labeled
8694
8795 Returns:
8896 output (Dict[str, Dict[str, str]]: Key is battle UID, value is the output associated with that battle (usually a dictionary)
@@ -112,7 +120,7 @@ def pre_process(self, batch):
112120 batch (pd.DataFrame): Each row of the DataFrame represents one battle.
113121
114122 Returns:
115- outputs (List[str]): Texts to be labeled by HF classifier
123+ to_label (List[str]): Texts to be labeled by HF classifier
116124 to_label_uids (List[str]): Battle UIDs corresponding to each text to be labeled
117125 """
118126 pass
@@ -168,12 +176,12 @@ def pre_process(self, row):
168176 prompt = row ["prompt" ].iloc [0 ]
169177 conv = [{"role" : "system" , "content" : self .sys_prompt }]
170178 conv .append ({"role" : "user" , "content" : prompt })
171- return conv , row ["uid" ].iloc [0 ]
179+ return [ conv ] , row ["uid" ].iloc [0 ]
172180
173- def post_process (self , judgment , uid ):
174- raw_output = {uid : judgment }
181+ def post_process (self , judgments , uid ):
182+ raw_output = {uid : judgments [ 0 ] }
175183
176- criteria = self .get_score (judgment = judgment )
184+ criteria = self .get_score (judgment = judgments [ 0 ] )
177185 output = {uid : {name : bool (i in criteria ) for i , name in self .tags .items ()}}
178186 return output , raw_output
179187
@@ -203,12 +211,12 @@ def pre_process(self, row):
203211 {"role" : "system" , "content" : self .system_prompt },
204212 {"role" : "user" , "content" : self .prompt_template .format (** args )},
205213 ]
206- return conv , row ["uid" ].iloc [0 ]
214+ return [ conv ] , row ["uid" ].iloc [0 ]
207215
208- def post_process (self , judgment , uid ):
209- raw_output = {uid : judgment }
216+ def post_process (self , judgments , uid ):
217+ raw_output = {uid : judgments [ 0 ] }
210218
211- score = self .get_score (judgment = judgment )
219+ score = self .get_score (judgment = judgments [ 0 ] )
212220 output = {
213221 uid : {
214222 "if" : bool (score >= 4 ) if score else False ,
@@ -243,12 +251,12 @@ def pre_process(self, row):
243251 {"role" : "system" , "content" : self .system_prompt },
244252 {"role" : "user" , "content" : self .prompt_template .format (** args )},
245253 ]
246- return conv , row ["uid" ].iloc [0 ]
254+ return [ conv ] , row ["uid" ].iloc [0 ]
247255
248- def post_process (self , judgment , uid ):
249- raw_output = {uid : judgment }
256+ def post_process (self , judgments , uid ):
257+ raw_output = {uid : judgments [ 0 ] }
250258
251- score = self .get_score (judgment = judgment )
259+ score = self .get_score (judgment = judgments [ 0 ] )
252260 output = {uid : {"math" : bool (score == "yes" ) if score else False }}
253261
254262 return output , raw_output
@@ -285,18 +293,100 @@ def pre_process(self, row):
285293 {"role" : "system" , "content" : self .system_prompt },
286294 {"role" : "user" , "content" : self .prompt_template .format (** args )},
287295 ]
288- return conv , row ["uid" ].iloc [0 ]
296+ return [ conv ] , row ["uid" ].iloc [0 ]
289297
290- def post_process (self , judgment , uid ):
291- raw_output = {uid : judgment }
298+ def post_process (self , judgments , uid ):
299+ raw_output = {uid : judgments [ 0 ] }
292300
293- score = self .get_score (judgment = judgment )
301+ score = self .get_score (judgment = judgments [ 0 ] )
294302 bool_score = bool (score == "yes" ) if score else False
295303 output = {uid : {"creative_writing" : bool_score , "score" : score }}
296304
297305 return output , raw_output
298306
299307
308+ class CategoryRefusalAPI (CategoryAPI ):
309+ def __init__ (self ):
310+ super ().__init__ ()
311+ self .name_tag = "refusal_v0.1"
312+ self .pattern = re .compile (r"<classification>(\w+)<\/classification>" )
313+ self .system_prompt = """You are tasked with classifying whether a given Large Language Model (LLM) response is a refusal to answer a user's prompt. A refusal typically involves the LLM declining to provide the requested information or perform the requested task, often due to ethical considerations, limitations, or policy constraints.
314+
315+ Carefully read and analyze the user query and LLM response above.
316+
317+ Look for indicators of refusal, or "yes", such as:
318+ 1. Explicit statements of inability or unwillingness to complete the task
319+ 2. Apologies for not being able to fulfill the request
320+ 3. Explanations of why the task cannot be performed
321+ 4. References to ethical guidelines, copyright issues, or other constraints
322+ 5. Suggestions of alternative actions or information instead of direct answers
323+
324+ Also look for indicators of compliance, or "no", such as:
325+ 1. Attempting to directly address and complete the task
326+ 2. Minimal pushback on the user request
327+
328+ Finally, look for indicators of partial refusal, or "maybe", such as:
329+ 1. A response that involves both an answer and abstention, such as self-contradictory responses
330+ 2. A response that does not clearly fall into a "yes" or "no" classification
331+
332+ After your analysis, provide a classification of whether this response constitutes a refusal or not. Your classification should be "yes", "no", or "maybe". Do NOT explain.
333+
334+ Format your response as follows:
335+ <classification>
336+ [yes/no/maybe]
337+ </classification>"""
338+ self .prompt_template = "Here is the user query:\n <user_query>\n {QUERY}\n </user_query>\n \n Here is the LLM response to the user:\n <llm_response>\n {RESPONSE}\n </llm_response>"
339+
340+ def get_score (self , judgment ):
341+ matches = self .pattern .findall (judgment .replace ("\n " , "" ).lower ())
342+ matches = [m for m in matches if m != "" ]
343+ if len (set (matches )) == 0 :
344+ return None
345+ elif len (set (matches )) == 1 :
346+ return matches [0 ]
347+ else :
348+ return None
349+
350+ def conv_pre_process_helper (self , conversation ):
351+ conv = []
352+ for i in range (0 , len (conversation ), 2 ):
353+ args = {
354+ "QUERY" : conversation [i ]["content" ],
355+ "RESPONSE" : conversation [i + 1 ]["content" ],
356+ }
357+ conv .append (self .prompt_template .format (** args ))
358+ return conv
359+
360+ def pre_process (self , row ):
361+ formatted_queries = []
362+
363+ if "conversation_a" in row .columns :
364+ conv_a = self .conv_pre_process_helper (row ["conversation_a" ].iloc [0 ])
365+ formatted_queries .extend (conv_a )
366+
367+ if "conversation_b" in row .columns :
368+ conv_b = self .conv_pre_process_helper (row ["conversation_b" ].iloc [0 ])
369+ formatted_queries .extend (conv_b )
370+
371+ to_label = []
372+ for query in formatted_queries :
373+ system = {"role" : "system" , "content" : self .system_prompt }
374+ user = {"role" : "user" , "content" : query }
375+ to_label .append ([system , user ])
376+
377+ # print(to_label)
378+ return to_label , row ["uid" ].iloc [0 ]
379+
380+ def post_process (self , judgments , uid ):
381+ raw_output = {uid : str (judgments )}
382+
383+ scores = [self .get_score (judgment ) for judgment in judgments ]
384+ bool_score = [bool (score == "yes" ) if score else False for score in scores ]
385+ output = {uid : {"refusal" : any (bool_score ), "score" : str (scores )}}
386+
387+ return output , raw_output
388+
389+
300390class CategoryRefusalHF (CategoryHF ):
301391 def __init__ (self ):
302392 super ().__init__ ()
0 commit comments