
Commit 865f3e9

✨ Add llama-guard separate safety category in output
Signed-off-by: Gaurav-Kumbhat <Gaurav.Kumbhat@ibm.com>
1 parent 7e6d3f0 commit 865f3e9

File tree

3 files changed: +118 -14 lines changed

  vllm_detector_adapter/generative_detectors/base.py
  vllm_detector_adapter/generative_detectors/llama_guard.py
  vllm_detector_adapter/protocol.py

vllm_detector_adapter/generative_detectors/base.py

Lines changed: 2 additions & 2 deletions

@@ -102,7 +102,7 @@ def apply_task_template(
         return request

     @detector_dispatcher(types=[DetectorType.TEXT_CHAT])
-    def preprocess_request( # noqa: F811
+    def preprocess_request( # noqa: F811
         self, request: ChatDetectionRequest
     ) -> Union[ChatDetectionRequest, ErrorResponse]:
         """Preprocess chat request"""
@@ -112,7 +112,7 @@ def preprocess_request( # noqa: F811
     ##### Contents request processing functions ####################################

     @detector_dispatcher(types=[DetectorType.TEXT_CONTENT])
-    def preprocess_request( # noqa: F811
+    def preprocess_request( # noqa: F811
         self, request: ContentsDetectionRequest
     ) -> Union[ContentsDetectionRequest, ErrorResponse]:
         """Preprocess contents request and convert it into appropriate chat request"""

vllm_detector_adapter/generative_detectors/llama_guard.py

Lines changed: 111 additions & 2 deletions

@@ -1,6 +1,20 @@
+# Standard
+from typing import Optional
+import asyncio
+import copy
+
+# Third Party
+from fastapi import Request
+from vllm.entrypoints.openai.protocol import ErrorResponse
+
 # Local
 from vllm_detector_adapter.generative_detectors.base import ChatCompletionDetectionBase
 from vllm_detector_adapter.logging import init_logger
+from vllm_detector_adapter.protocol import (
+    ContentsDetectionRequest,
+    ContentsDetectionResponse,
+)
+from vllm_detector_adapter.utils import DetectorType

 logger = init_logger(__name__)

@@ -13,5 +27,100 @@ class LlamaGuard(ChatCompletionDetectionBase):
     SAFE_TOKEN = "safe"
     UNSAFE_TOKEN = "unsafe"

-    # NOTE: More intelligent template parsing can be done here, potentially
-    # as a regex template for safe vs. unsafe and the 'unsafe' category
+    def __post_process_results(self, results):
+        # NOTE: Llama Guard returns the specific safety categories on the last line of its
+        # response, in CSV format. This is guided by the model's prompt definition, so we
+        # expect Llama Guard to adhere to it, at least for Llama-Guard-3 (the latest at the
+        # time of writing).
+
+        # NOTE: The concept of a "choice" doesn't exist for the content-type detector API,
+        # so we essentially flatten the responses: each category within one choice
+        # also shows up as if it were another choice.
+
+        (responses, scores, detection_type) = results
+
+        new_choices = []
+        new_scores = []
+
+        for i, choice in enumerate(responses.choices):
+            content = choice.message.content
+            if self.UNSAFE_TOKEN in content:
+                # Create a result for each unsafe category,
+                # in addition to "unsafe" as a category itself
+                # NOTE: need to deepcopy, otherwise the choice would get overwritten
+                unsafe_choice = copy.deepcopy(choice)
+                unsafe_choice.message.content = self.UNSAFE_TOKEN
+
+                new_choices.append(unsafe_choice)
+                new_scores.append(scores[i])
+
+                # Fetch the categories from the last line of the response, which is in CSV format
+                for category in content.strip().split("\n")[-1].split(","):
+                    category_choice = copy.deepcopy(choice)
+                    category_choice.message.content = category
+                    new_choices.append(category_choice)
+                    # NOTE: currently using the same score as "unsafe";
+                    # we may revisit this to provide a better per-category score
+                    new_scores.append(scores[i])
+            else:
+                # "safe" case
+                new_choices.append(choice)
+                new_scores.append(scores[i])
+
+        responses.choices = new_choices
+        return (responses, new_scores, detection_type)
+
+    async def content_analysis(
+        self,
+        request: ContentsDetectionRequest,
+        raw_request: Optional[Request] = None,
+    ):
+        """Function used to call chat detection and provide a /text/contents response"""
+
+        # Apply task template if it exists
+        if self.task_template:
+            request = self.apply_task_template(
+                request, fn_type=DetectorType.TEXT_CONTENT
+            )
+            if isinstance(request, ErrorResponse):
+                # Propagate any request problems that will not allow
+                # task template to be applied
+                return request
+
+        # Since a separate batch processing function doesn't exist at the time of writing,
+        # we simply collect all the text from the contents request, fire off separate
+        # requests, and wait for them asynchronously.
+        # This mirrors how batching is handled by the run_batch function under
+        # entrypoints/openai/ in the vLLM codebase.
+        completion_requests = self.preprocess_request(
+            request, fn_type=DetectorType.TEXT_CONTENT
+        )
+
+        # Send all the completion requests asynchronously.
+        tasks = [
+            asyncio.create_task(
+                self.process_chat_completion_with_scores(
+                    completion_request, raw_request
+                )
+            )
+            for completion_request in completion_requests
+        ]
+
+        # Gather all the results
+        # NOTE: The results are guaranteed to be in the order of the requests
+        results = await asyncio.gather(*tasks)
+
+        # If there is any error, return it; otherwise, return the whole response,
+        # properly formatted.
+        categorized_results = []
+        for result in results:
+            # NOTE: we only return one of the error results,
+            # not every error and not a cumulative one
+            if isinstance(result, ErrorResponse):
+                return result
+            else:
+                # Process results to split out safety categories into separate objects
+                categorized_results.append(self.__post_process_results(result))
+
+        return ContentsDetectionResponse.from_chat_completion_response(
+            categorized_results, request.contents
+        )
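
To make the flattening above concrete, here is a minimal, standalone sketch of the category split performed by __post_process_results, assuming a raw Llama Guard completion such as "unsafe\nS1,S10". The helper name flatten_guard_output and the sample categories are hypothetical and for illustration only.

# Hypothetical helper mirroring the splitting logic above on a plain string:
# "unsafe" plus one entry per comma-separated category from the last line.
def flatten_guard_output(raw: str) -> list:
    unsafe_token = "unsafe"
    if unsafe_token not in raw:
        # "safe" case: keep the response as a single detection
        return [raw.strip()]
    categories = raw.strip().split("\n")[-1].split(",")
    return [unsafe_token] + [c.strip() for c in categories]


print(flatten_guard_output("unsafe\nS1,S10"))  # ['unsafe', 'S1', 'S10']
print(flatten_guard_output("safe"))            # ['safe']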

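The per-content fan-out in content_analysis relies only on asyncio.create_task plus asyncio.gather, which returns results in task order. Below is a minimal sketch of that pattern, with fake_completion standing in (hypothetically) for process_chat_completion_with_scores.

import asyncio


# Hypothetical stand-in for process_chat_completion_with_scores
async def fake_completion(text: str) -> str:
    await asyncio.sleep(0)  # pretend to call the model
    return f"safe: {text}"


async def main():
    contents = ["first text", "second text", "third text"]
    # One task per content item, as in the fan-out above
    tasks = [asyncio.create_task(fake_completion(c)) for c in contents]
    # gather preserves task order, so results line up with contents
    results = await asyncio.gather(*tasks)
    print(results)


asyncio.run(main())
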
vllm_detector_adapter/protocol.py

Lines changed: 5 additions & 10 deletions

@@ -48,10 +48,7 @@ class ContentsDetectionResponse(RootModel):
     root: List[List[ContentsDetectionResponseObject]]

     @staticmethod
-    def from_chat_completion_response(
-        results,
-        # responses: ChatCompletionResponse, scores: List[float], detection_type: str
-    ):
+    def from_chat_completion_response(results, contents: List[str]):
         """Function to convert openai chat completion response to [fms] contents detection response

         Args:
@@ -63,23 +60,23 @@ def from_chat_completion_response(
         """
         contents_detection_responses = []

-        for (responses, scores, detection_type) in results:
+        for content_idx, (responses, scores, detection_type) in enumerate(results):

             detection_responses = []
             for i, choice in enumerate(responses.choices):
                 content = choice.message.content
                 # NOTE: for providing spans, we currently consider the entire generated text as a span.
                 # This is because, at the time of writing, the generative guardrail models do not
-                # provide specific information about text, which can be used to deduce spans.
+                # provide specific information about the input text, which can be used to deduce spans.
                 start = 0
-                end = len(content)
+                end = len(contents[content_idx])
                 if content and isinstance(content, str):
                     response_object = ContentsDetectionResponseObject(
                         detection_type=detection_type,
                         detection=content.strip(),
                         start=start,
                         end=end,
-                        text=content,
+                        text=contents[content_idx],
                         score=scores[i],
                     ).model_dump()
                     detection_responses.append(response_object)
@@ -94,8 +91,6 @@ def from_chat_completion_response(
                         type="BadRequestError",
                         code=HTTPStatus.BAD_REQUEST.value,
                     )
-
-        # return ContentsDetectionResponse(root=detection_responses)
             contents_detection_responses.append(detection_responses)

         return ContentsDetectionResponse(root=contents_detection_responses)
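
Putting the two changes together, here is a hedged sketch of what a flattened /text/contents payload could look like for one content item that Llama Guard flags as unsafe with category S1. The category label, score, detection_type value, and sample text below are illustrative assumptions; only the field names and nesting follow ContentsDetectionResponseObject and ContentsDetectionResponse above.

# Illustrative only: values are made up; structure mirrors
# ContentsDetectionResponse (one inner list per content item).
example_response = [
    [
        {
            "detection_type": "risk",   # illustrative detection type
            "detection": "unsafe",
            "start": 0,
            "end": 28,                  # spans currently cover the whole input text
            "text": "some potentially unsafe text",
            "score": 0.97,
        },
        {
            "detection_type": "risk",
            "detection": "S1",          # the specific category, now a separate object
            "start": 0,
            "end": 28,
            "text": "some potentially unsafe text",
            "score": 0.97,              # currently reuses the "unsafe" score
        },
    ]
]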
