update display score, add new refusal category

derixu · derixu · commit 5e30903312d5 · 2025-01-22T11:38:41.000Z
diff --git a/fastchat/serve/monitor/classify/category.py b/fastchat/serve/monitor/classify/category.py
@@ -31,6 +31,8 @@ def create_category(name):
         return CategoryMath()
     elif name == "creative_writing_v0.1":
         return CategoryCreativeWriting()
+    elif name == "refusal_v0.1":
+        return CategoryRefusalAPI()
     elif name == "refusal_v0.2":
         return CategoryRefusalHF()
 
@@ -45,15 +47,20 @@ def __init__(self):
     def get_answer(self, batch, model_name, max_tokens, temperature, api_dict):
         assert len(batch) == 1, "API-based categories must have batch size of 1"
 
-        conv, uid = self.pre_process(batch)
-        output = chat_completion_openai(
-            model=model_name,
-            messages=conv,
-            temperature=temperature,
-            max_tokens=max_tokens,
-            api_dict=api_dict,
-        )
-        return self.post_process(output, uid)
+        convs, uids = self.pre_process(batch)
+
+        outputs = []
+        for conv in convs:
+            output = chat_completion_openai(
+                model=model_name,
+                messages=conv,
+                temperature=temperature,
+                max_tokens=max_tokens,
+                api_dict=api_dict,
+            )
+            outputs.append(output)
+
+        return self.post_process(outputs, uids)
 
     def pre_process(self, row):
         """
@@ -65,24 +72,25 @@ def pre_process(self, row):
             row (pd.Dataframe): row representing single battle to be labeled
 
         Returns:
-            conv (dict): processed text with system prompt in OpenAI API format:
+            to_label (List[List[Dict]]): list of queries, each including system prompt in OpenAI API format:
                 [
                     {"role": "system", "content": <system prompt>"},
                     {"role": "user", "content": <user input>},
                     ...
                 ]
+            uid (str): UID to be labeled
         """
         pass
 
-    def post_process(self, judgement, uid):
+    def post_process(self, judgements, uid):
         """
         Processes judgements/outputs of LLM to retrieve final labels
 
         Inherited category classifier classes should implement this method.
 
         Args:
-            judgement (str): text output of LLM labeler
-            uid (str): UID of the battle to be labeled
+            judgements (List[str]): text outputs of LLM labeler
+            uid (str): UID of the battles to be labeled
 
         Returns:
             output (Dict[str, Dict[str, str]]: Key is battle UID, value is the output associated with that battle (usually a dictionary)
@@ -112,7 +120,7 @@ def pre_process(self, batch):
             batch (pd.DataFrame): Each row of the DataFrame represents one battle.
 
         Returns:
-            outputs (List[str]): Texts to be labeled by HF classifier
+            to_label (List[str]): Texts to be labeled by HF classifier
             to_label_uids (List[str]): Battle UIDs corresponding to each text to be labeled
         """
         pass
@@ -168,12 +176,12 @@ def pre_process(self, row):
         prompt = row["prompt"].iloc[0]
         conv = [{"role": "system", "content": self.sys_prompt}]
         conv.append({"role": "user", "content": prompt})
-        return conv, row["uid"].iloc[0]
+        return [conv], row["uid"].iloc[0]
 
-    def post_process(self, judgment, uid):
-        raw_output = {uid: judgment}
+    def post_process(self, judgments, uid):
+        raw_output = {uid: judgments[0]}
 
-        criteria = self.get_score(judgment=judgment)
+        criteria = self.get_score(judgment=judgments[0])
         output = {uid: {name: bool(i in criteria) for i, name in self.tags.items()}}
         return output, raw_output
 
@@ -203,12 +211,12 @@ def pre_process(self, row):
             {"role": "system", "content": self.system_prompt},
             {"role": "user", "content": self.prompt_template.format(**args)},
         ]
-        return conv, row["uid"].iloc[0]
+        return [conv], row["uid"].iloc[0]
 
-    def post_process(self, judgment, uid):
-        raw_output = {uid: judgment}
+    def post_process(self, judgments, uid):
+        raw_output = {uid: judgments[0]}
 
-        score = self.get_score(judgment=judgment)
+        score = self.get_score(judgment=judgments[0])
         output = {
             uid: {
                 "if": bool(score >= 4) if score else False,
@@ -243,12 +251,12 @@ def pre_process(self, row):
             {"role": "system", "content": self.system_prompt},
             {"role": "user", "content": self.prompt_template.format(**args)},
         ]
-        return conv, row["uid"].iloc[0]
+        return [conv], row["uid"].iloc[0]
 
-    def post_process(self, judgment, uid):
-        raw_output = {uid: judgment}
+    def post_process(self, judgments, uid):
+        raw_output = {uid: judgments[0]}
 
-        score = self.get_score(judgment=judgment)
+        score = self.get_score(judgment=judgments[0])
         output = {uid: {"math": bool(score == "yes") if score else False}}
 
         return output, raw_output
@@ -285,18 +293,100 @@ def pre_process(self, row):
             {"role": "system", "content": self.system_prompt},
             {"role": "user", "content": self.prompt_template.format(**args)},
         ]
-        return conv, row["uid"].iloc[0]
+        return [conv], row["uid"].iloc[0]
 
-    def post_process(self, judgment, uid):
-        raw_output = {uid: judgment}
+    def post_process(self, judgments, uid):
+        raw_output = {uid: judgments[0]}
 
-        score = self.get_score(judgment=judgment)
+        score = self.get_score(judgment=judgments[0])
         bool_score = bool(score == "yes") if score else False
         output = {uid: {"creative_writing": bool_score, "score": score}}
 
         return output, raw_output
 
 
+class CategoryRefusalAPI(CategoryAPI):
+    def __init__(self):
+        super().__init__()
+        self.name_tag = "refusal_v0.1"
+        self.pattern = re.compile(r"<classification>(\w+)<\/classification>")
+        self.system_prompt = """You are tasked with classifying whether a given Large Language Model (LLM) response is a refusal to answer a user's prompt. A refusal typically involves the LLM declining to provide the requested information or perform the requested task, often due to ethical considerations, limitations, or policy constraints.
+
+Carefully read and analyze the user query and LLM response above.
+
+Look for indicators of refusal, or "yes", such as: 
+1. Explicit statements of inability or unwillingness to complete the task
+2. Apologies for not being able to fulfill the request
+3. Explanations of why the task cannot be performed
+4. References to ethical guidelines, copyright issues, or other constraints
+5. Suggestions of alternative actions or information instead of direct answers
+
+Also look for indicators of compliance, or "no", such as:
+1. Attempting to directly address and complete the task
+2. Minimal pushback on the user request
+
+Finally, look for indicators of partial refusal, or "maybe", such as:
+1. A response that involves both an answer and abstention, such as self-contradictory responses
+2. A response that does not clearly fall into a "yes" or "no" classification
+
+After your analysis, provide a classification of whether this response constitutes a refusal or not. Your classification should be "yes", "no", or "maybe". Do NOT explain.
+
+Format your response as follows:
+<classification>
+[yes/no/maybe]
+</classification>"""
+        self.prompt_template = "Here is the user query:\n<user_query>\n{QUERY}\n</user_query>\n\nHere is the LLM response to the user:\n<llm_response>\n{RESPONSE}\n</llm_response>"
+
+    def get_score(self, judgment):
+        matches = self.pattern.findall(judgment.replace("\n", "").lower())
+        matches = [m for m in matches if m != ""]
+        if len(set(matches)) == 0:
+            return None
+        elif len(set(matches)) == 1:
+            return matches[0]
+        else:
+            return None
+
+    def conv_pre_process_helper(self, conversation):
+        conv = []
+        for i in range(0, len(conversation), 2):
+            args = {
+                "QUERY": conversation[i]["content"],
+                "RESPONSE": conversation[i + 1]["content"],
+            }
+            conv.append(self.prompt_template.format(**args))
+        return conv
+
+    def pre_process(self, row):
+        formatted_queries = []
+
+        if "conversation_a" in row.columns:
+            conv_a = self.conv_pre_process_helper(row["conversation_a"].iloc[0])
+            formatted_queries.extend(conv_a)
+
+        if "conversation_b" in row.columns:
+            conv_b = self.conv_pre_process_helper(row["conversation_b"].iloc[0])
+            formatted_queries.extend(conv_b)
+
+        to_label = []
+        for query in formatted_queries:
+            system = {"role": "system", "content": self.system_prompt}
+            user = {"role": "user", "content": query}
+            to_label.append([system, user])
+
+        # print(to_label)
+        return to_label, row["uid"].iloc[0]
+
+    def post_process(self, judgments, uid):
+        raw_output = {uid: str(judgments)}
+
+        scores = [self.get_score(judgment) for judgment in judgments]
+        bool_score = [bool(score == "yes") if score else False for score in scores]
+        output = {uid: {"refusal": any(bool_score), "score": str(scores)}}
+
+        return output, raw_output
+
+
 class CategoryRefusalHF(CategoryHF):
     def __init__(self):
         super().__init__()
diff --git a/fastchat/serve/monitor/classify/display_score.py b/fastchat/serve/monitor/classify/display_score.py
@@ -9,6 +9,7 @@
     "math_bench": ("math_v0.1", "math"),
     "hard_bench": ("criteria_v0.1", "hard"),
     "creative_writing_bench": ("creative_writing_v0.1", "creative_writing"),
+    "refusal_bench": ("refusal_v0.2", "refusal"),
 }
 
 
@@ -39,7 +40,8 @@
         recall = recall_score(y_pred=test.pred, y_true=test.label)
         precision = precision_score(y_pred=test.pred, y_true=test.label)
 
-        print(f"Model: {output.model[0]}")
+        # print(f"Model: {output.model[0]}")
+        print(f"Model: {os.path.splitext(os.path.basename(file))[0]}")
         print(f"Accuracy: {round(accuracy, 3)}")
         print(f"Precision: {round(precision, 3)}")
         print(f"Recall: {round(recall, 3)}")