Skip to content

Commit d4ab934

Browse files
committed
♻️ Remove llama-guard category output as separate label in content type
Signed-off-by: Gaurav-Kumbhat <Gaurav.Kumbhat@ibm.com>
1 parent 6adb920 commit d4ab934

File tree

2 files changed

+11
-28
lines changed

2 files changed

+11
-28
lines changed

tests/generative_detectors/test_llama_guard.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -245,11 +245,7 @@ def test_post_process_content_splits_unsafe_categories(llama_guard_detection):
245245
assert isinstance(responses, ChatCompletionResponse)
246246
assert responses.choices[0].message.content == "unsafe"
247247
assert scores[0] == unsafe_score
248-
assert responses.choices[1].message.content == "S2"
249-
# NOTE: currently we expect same score for each category
250-
assert scores[1] == unsafe_score
251-
assert responses.choices[2].message.content == "S3"
252-
assert scores[2] == unsafe_score
248+
assert len(responses.choices) == 1
253249

254250

255251
def test_post_process_content_works_for_safe(llama_guard_detection):

vllm_detector_adapter/generative_detectors/llama_guard.py

Lines changed: 10 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
# Standard
22
from typing import Optional
33
import asyncio
4-
import copy
54

65
# Third Party
76
from fastapi import Request
@@ -45,33 +44,21 @@ def __post_process_result(self, responses, scores, detection_type):
4544
# this is guided by the prompt definition of the model, so we expect llama_guard to adhere to it
4645
# at least for Llama-Guard-3 (latest at the time of writing)
4746

48-
# NOTE: The concept of "choice" doesn't exist for content type detector API, so
49-
# we will essentially flatten out the responses, so different categories in 1 choice
50-
# will also look like another choice.
47+
# In this function, we remove the "safety" categories from the output; later on, we will
48+
# move them to evidences.
5149

5250
new_choices = []
5351
new_scores = []
5452

53+
# NOTE: we are flattening out choices here as different categories
5554
for i, choice in enumerate(responses.choices):
5655
content = choice.message.content
5756
if self.UNSAFE_TOKEN in content:
58-
# We will create multiple results for each unsafe category
59-
# in addition to "unsafe" as a category itself
60-
# NOTE: need to deepcopy, otherwise, choice will get overwritten
61-
unsafe_choice = copy.deepcopy(choice)
62-
unsafe_choice.message.content = self.UNSAFE_TOKEN
63-
64-
new_choices.append(unsafe_choice)
57+
# Reason for reassigning the content:
58+
# We want to remove the safety category from the content
59+
choice.message.content = self.UNSAFE_TOKEN
60+
new_choices.append(choice)
6561
new_scores.append(scores[i])
66-
67-
# Fetch categories as the last line in the response available in csv format
68-
for category in content.splitlines()[-1].split(","):
69-
category_choice = copy.deepcopy(choice)
70-
category_choice.message.content = category
71-
new_choices.append(category_choice)
72-
# NOTE: currently using same score as "unsafe"
73-
# but we need to see if we can revisit this to get better score
74-
new_scores.append(scores[i])
7562
else:
7663
# "safe" case
7764
new_choices.append(choice)
@@ -122,16 +109,16 @@ async def content_analysis(
122109

123110
# If there is any error, return that otherwise, return the whole response
124111
# properly formatted.
125-
categorized_results = []
112+
processed_result = []
126113
for result in results:
127114
# NOTE: we return only the first error result encountered,
128115
# not every error and not a cumulative one
129116
if isinstance(result, ErrorResponse):
130117
return result
131118
else:
132119
# Process results to strip the safety categories from the response content
133-
categorized_results.append(self.__post_process_result(*result))
120+
processed_result.append(self.__post_process_result(*result))
134121

135122
return ContentsDetectionResponse.from_chat_completion_response(
136-
categorized_results, request.contents
123+
processed_result, request.contents
137124
)

0 commit comments

Comments
 (0)