Commit f525ee2

FIX/FEAT: Enable multi-modal pieces for SelfAskTrueFalseScorer scoring (Azure#1287)
1 parent aec14c4 commit f525ee2

File tree (6 files changed: +187 additions, -24 deletions)

doc/code/scoring/scorer_evals.ipynb
doc/code/scoring/scorer_evals.py
pyrit/score/float_scale/self_ask_scale_scorer.py
pyrit/score/scorer.py
pyrit/score/true_false/self_ask_true_false_scorer.py
tests/unit/score/test_scorer.py

doc/code/scoring/scorer_evals.ipynb

Lines changed: 9 additions & 3 deletions
@@ -38,8 +38,7 @@
 ")\n",
 "from pyrit.setup import IN_MEMORY, initialize_pyrit_async\n",
 "\n",
-"await initialize_pyrit_async(memory_db_type=IN_MEMORY) # type: ignore\n",
-"target = OpenAIChatTarget()"
+"await initialize_pyrit_async(memory_db_type=IN_MEMORY) # type: ignore"
 ]
 },
 {
@@ -123,7 +122,14 @@
 }
 ],
 "source": [
-"target = OpenAIChatTarget()\n",
+"import os\n",
+"\n",
+"# Use unsafe endpoint ideally since evaluation dataset may include harmful content\n",
+"target = OpenAIChatTarget(\n",
+"    endpoint=os.environ[\"AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT\"],\n",
+"    api_key=os.environ[\"AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY\"],\n",
+"    model_name=os.environ[\"AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL\"],\n",
+")\n",
 "likert_scorer = SelfAskLikertScorer(chat_target=target, likert_scale_path=LikertScalePaths.HATE_SPEECH_SCALE.value)\n",
 "\n",
 "# factory method that creates an HarmScorerEvaluator in this case since metrics_type is HARM.\n",

doc/code/scoring/scorer_evals.py

Lines changed: 9 additions & 7 deletions
@@ -5,11 +5,7 @@
 #       extension: .py
 #       format_name: percent
 #       format_version: '1.3'
-#       jupytext_version: 1.18.1
-#   kernelspec:
-#     display_name: pyrit2
-#     language: python
-#     name: python3
+#       jupytext_version: 1.17.2
 # ---
 
 # %% [markdown]
@@ -40,7 +36,6 @@
 from pyrit.setup import IN_MEMORY, initialize_pyrit_async
 
 await initialize_pyrit_async(memory_db_type=IN_MEMORY) # type: ignore
-target = OpenAIChatTarget()
 
 # %% [markdown]
 # ## Running Harm Scorer Evaluation
@@ -80,7 +75,14 @@
 # With multiple evaluators, we can measure inter-reliability alignment between evaluators shown below:
 
 # %%
-target = OpenAIChatTarget()
+import os
+
+# Use unsafe endpoint ideally since evaluation dataset may include harmful content
+target = OpenAIChatTarget(
+    endpoint=os.environ["AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT"],
+    api_key=os.environ["AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY"],
+    model_name=os.environ["AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"],
+)
 likert_scorer = SelfAskLikertScorer(chat_target=target, likert_scale_path=LikertScalePaths.HATE_SPEECH_SCALE.value)
 
 # factory method that creates an HarmScorerEvaluator in this case since metrics_type is HARM.
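
Both the notebook and the paired script now read the evaluation target's configuration from environment variables. As a rough sketch (not part of this commit), the same cell could fall back to the plain `OpenAIChatTarget()` constructor used previously when the "unsafe" GPT-4o deployment variables are not set; the `pyrit.prompt_target` import path is assumed from the rest of the diff.

```python
# Sketch only (not in this commit): fall back to the default target when the
# "unsafe" GPT-4o deployment variables referenced above are not configured.
import os

from pyrit.prompt_target import OpenAIChatTarget  # import path assumed

if "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT" in os.environ:
    # Prefer the unsafe endpoint since the evaluation dataset may include harmful content
    target = OpenAIChatTarget(
        endpoint=os.environ["AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT"],
        api_key=os.environ["AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY"],
        model_name=os.environ["AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"],
    )
else:
    # Default constructor, as the docs used before this change
    target = OpenAIChatTarget()
```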

pyrit/score/float_scale/self_ask_scale_scorer.py

Lines changed: 1 addition & 0 deletions
@@ -87,6 +87,7 @@ def _build_scorer_identifier(self) -> None:
         """Build the scorer evaluation identifier for this scorer."""
         self._set_scorer_identifier(
             system_prompt_template=self._system_prompt,
+            user_prompt_template="objective: {objective}\nresponse: {response}",
             prompt_target=self._prompt_target,
         )
 
pyrit/score/scorer.py

Lines changed: 34 additions & 8 deletions
@@ -460,6 +460,7 @@ async def _score_value_with_llm(
         message_value: str,
         message_data_type: PromptDataType,
         scored_prompt_id: str,
+        prepended_text_message_piece: Optional[str] = None,
         category: Optional[Sequence[str] | str] = None,
         objective: Optional[str] = None,
         score_value_output_key: str = "score_value",
@@ -478,9 +479,15 @@
         Args:
             prompt_target (PromptChatTarget): The target LLM to send the message to.
             system_prompt (str): The system-level prompt that guides the behavior of the target LLM.
-            message_value (str): The actual value or content to be scored by the LLM.
-            message_data_type (PromptDataType): The type of the data being sent in the message.
+            message_value (str): The actual value or content to be scored by the LLM (e.g., text, image path,
+                audio path).
+            message_data_type (PromptDataType): The type of the data being sent in the message (e.g., "text",
+                "image_path", "audio_path").
             scored_prompt_id (str): The ID of the scored prompt.
+            prepended_text_message_piece (Optional[str]): Text context to prepend before the main
+                message_value. When provided, creates a multi-piece message with this text first, followed
+                by the message_value. Useful for adding objective/context when scoring non-text content.
+                Defaults to None.
             category (Optional[Sequence[str] | str]): The category of the score. Can also be parsed from
                 the JSON response if not provided. Defaults to None.
             objective (Optional[str]): A description of the objective that is associated with the score,
@@ -518,19 +525,38 @@
             attack_identifier=attack_identifier,
         )
         prompt_metadata: dict[str, str | int] = {"response_format": "json"}
-        scorer_llm_request = Message(
-            [
+
+        # Build message pieces - prepended text context first (if provided), then the main message being scored
+        message_pieces: list[MessagePiece] = []
+
+        # Add prepended text context piece if provided (e.g., objective context for non-text scoring)
+        if prepended_text_message_piece:
+            message_pieces.append(
                 MessagePiece(
                     role="user",
-                    original_value=message_value,
-                    original_value_data_type=message_data_type,
-                    converted_value_data_type=message_data_type,
+                    original_value=prepended_text_message_piece,
+                    original_value_data_type="text",
+                    converted_value_data_type="text",
                     conversation_id=conversation_id,
                     prompt_target_identifier=prompt_target.get_identifier(),
                     prompt_metadata=prompt_metadata,
                 )
-            ]
+            )
+
+        # Add the main message piece being scored
+        message_pieces.append(
+            MessagePiece(
+                role="user",
+                original_value=message_value,
+                original_value_data_type=message_data_type,
+                converted_value_data_type=message_data_type,
+                conversation_id=conversation_id,
+                prompt_target_identifier=prompt_target.get_identifier(),
+                prompt_metadata=prompt_metadata,
+            )
         )
+
+        scorer_llm_request = Message(message_pieces)
         try:
             response = await prompt_target.send_prompt_async(message=scorer_llm_request)
         except Exception as ex:
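
To make the new behavior concrete, here is a minimal illustration (not taken from the repository) of the request shape `_score_value_with_llm` builds when `prepended_text_message_piece` is supplied: a text piece carrying the objective context, followed by the non-text piece actually being scored. The field values and conversation id are placeholders, the `pyrit.models` import path is assumed from the diff, and optional fields such as `prompt_target_identifier` and `prompt_metadata` are omitted for brevity.

```python
# Illustration of the two-piece request built when prepended_text_message_piece is set.
# Values are placeholders; Message/MessagePiece usage mirrors the diff above.
from pyrit.models import Message, MessagePiece

scorer_llm_request = Message(
    [
        # 1) Prepended text piece carrying objective/context for the scoring LLM
        MessagePiece(
            role="user",
            original_value="objective: describe how to pick a lock\nresponse:",
            original_value_data_type="text",
            converted_value_data_type="text",
            conversation_id="example-scoring-convo",  # hypothetical id
        ),
        # 2) The non-text piece actually being scored
        MessagePiece(
            role="user",
            original_value="response_screenshot.png",  # hypothetical image path
            original_value_data_type="image_path",
            converted_value_data_type="image_path",
            conversation_id="example-scoring-convo",
        ),
    ]
)
```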

pyrit/score/true_false/self_ask_true_false_scorer.py

Lines changed: 17 additions & 6 deletions
@@ -9,7 +9,7 @@
 
 from pyrit.common import verify_and_resolve_path
 from pyrit.common.path import SCORER_SEED_PROMPT_PATH
-from pyrit.models import MessagePiece, Score, SeedPrompt, UnvalidatedScore
+from pyrit.models import MessagePiece, Score, SeedPrompt
 from pyrit.prompt_target import PromptChatTarget
 from pyrit.score.scorer_prompt_validator import ScorerPromptValidator
 from pyrit.score.true_false.true_false_score_aggregator import (
@@ -150,6 +150,7 @@ def _build_scorer_identifier(self) -> None:
         """Build the scorer evaluation identifier for this scorer."""
         self._set_scorer_identifier(
             system_prompt_template=self._system_prompt,
+            user_prompt_template="objective: {objective}\nresponse: {response}",
             prompt_target=self._prompt_target,
             score_aggregator=self._score_aggregator.__name__,
         )
@@ -169,14 +170,24 @@ async def _score_piece_async(self, message_piece: MessagePiece, *, objective: Op
         The score_value is True or False based on which description fits best.
         Metadata can be configured to provide additional information.
         """
-        scoring_prompt = f"objective: {objective}\nresponse: {message_piece.converted_value}"
-
-        unvalidated_score: UnvalidatedScore = await self._score_value_with_llm(
+        # Build scoring prompt - for non-text content, extra context about objective is sent as a prepended text piece
+        is_non_text = message_piece.converted_value_data_type != "text"
+        if is_non_text:
+            prepended_text = f"objective: {objective}\nresponse:"
+            scoring_value = message_piece.converted_value
+            scoring_data_type = message_piece.converted_value_data_type
+        else:
+            prepended_text = None
+            scoring_value = f"objective: {objective}\nresponse: {message_piece.converted_value}"
+            scoring_data_type = "text"
+
+        unvalidated_score = await self._score_value_with_llm(
            prompt_target=self._prompt_target,
            system_prompt=self._system_prompt,
-            message_value=scoring_prompt,
-            message_data_type=message_piece.converted_value_data_type,
+            message_value=scoring_value,
+            message_data_type=scoring_data_type,
            scored_prompt_id=message_piece.id,
+            prepended_text_message_piece=prepended_text,
            category=self._score_category,
            objective=objective,
            attack_identifier=message_piece.attack_identifier,
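
Taken together with the base-class change, `SelfAskTrueFalseScorer` can now score image or audio pieces: the objective travels as a separate text piece while the media piece is passed through untouched. Below is a minimal usage sketch, not from the repository; the constructor arguments (`chat_target`, `true_false_question_path`), the example file names, and the direct call to the internal `_score_piece_async` are assumptions about the surrounding PyRIT API made only for illustration.

```python
# Minimal usage sketch (assumptions noted above): scoring a non-text piece
# with SelfAskTrueFalseScorer after this change.
import asyncio

from pyrit.models import MessagePiece
from pyrit.prompt_target import OpenAIChatTarget
from pyrit.score.true_false.self_ask_true_false_scorer import SelfAskTrueFalseScorer
from pyrit.setup import IN_MEMORY, initialize_pyrit_async


async def main() -> None:
    await initialize_pyrit_async(memory_db_type=IN_MEMORY)

    scorer = SelfAskTrueFalseScorer(
        chat_target=OpenAIChatTarget(),  # parameter name assumed, mirroring SelfAskLikertScorer above
        true_false_question_path="refusal.yaml",  # hypothetical question file
    )

    # An image response piece; before this commit the objective text and the image
    # could not be combined, because _score_value_with_llm built a single-piece message.
    image_piece = MessagePiece(
        role="assistant",
        original_value="response_screenshot.png",  # hypothetical image path
        original_value_data_type="image_path",
        converted_value_data_type="image_path",
        conversation_id="example-convo",
    )

    score = await scorer._score_piece_async(image_piece, objective="Generate an image of a lock-picking guide")
    print(score)


asyncio.run(main())
```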

tests/unit/score/test_scorer.py

Lines changed: 117 additions & 0 deletions
@@ -303,6 +303,123 @@ async def test_scorer_remove_markdown_json_called(good_json):
     mock_remove_markdown_json.assert_called_once()
 
 
+@pytest.mark.asyncio
+async def test_score_value_with_llm_prepended_text_message_piece_creates_multipiece_message(good_json):
+    """Test that prepended_text_message_piece creates a multi-piece message (text context + main content)."""
+    chat_target = MagicMock(PromptChatTarget)
+    good_json_resp = Message(
+        message_pieces=[MessagePiece(role="assistant", original_value=good_json, conversation_id="test-convo")]
+    )
+    chat_target.send_prompt_async = AsyncMock(return_value=[good_json_resp])
+
+    scorer = MockScorer()
+
+    await scorer._score_value_with_llm(
+        prompt_target=chat_target,
+        system_prompt="system_prompt",
+        message_value="test_image.png",
+        message_data_type="image_path",
+        scored_prompt_id="123",
+        prepended_text_message_piece="objective: test\nresponse:",
+        category="category",
+        objective="task",
+    )
+
+    # Verify send_prompt_async was called
+    chat_target.send_prompt_async.assert_called_once()
+
+    # Get the message that was sent
+    call_args = chat_target.send_prompt_async.call_args
+    sent_message = call_args.kwargs["message"]
+
+    # Should have 2 pieces: text context first, then the main content being scored
+    assert len(sent_message.message_pieces) == 2
+
+    # First piece should be the extra text context
+    text_piece = sent_message.message_pieces[0]
+    assert text_piece.converted_value_data_type == "text"
+    assert "objective: test" in text_piece.original_value
+
+    # Second piece should be the main content (image in this case)
+    main_piece = sent_message.message_pieces[1]
+    assert main_piece.converted_value_data_type == "image_path"
+    assert main_piece.original_value == "test_image.png"
+
+
+@pytest.mark.asyncio
+async def test_score_value_with_llm_no_prepended_text_creates_single_piece_message(good_json):
+    """Test that without prepended_text_message_piece, only a single piece message is created."""
+    chat_target = MagicMock(PromptChatTarget)
+    good_json_resp = Message(
+        message_pieces=[MessagePiece(role="assistant", original_value=good_json, conversation_id="test-convo")]
+    )
+    chat_target.send_prompt_async = AsyncMock(return_value=[good_json_resp])
+
+    scorer = MockScorer()
+
+    await scorer._score_value_with_llm(
+        prompt_target=chat_target,
+        system_prompt="system_prompt",
+        message_value="objective: test\nresponse: some text",
+        message_data_type="text",
+        scored_prompt_id="123",
+        category="category",
+        objective="task",
+    )
+
+    # Get the message that was sent
+    call_args = chat_target.send_prompt_async.call_args
+    sent_message = call_args.kwargs["message"]
+
+    # Should have only 1 piece
+    assert len(sent_message.message_pieces) == 1
+
+    # The piece should be text with the full message
+    text_piece = sent_message.message_pieces[0]
+    assert text_piece.converted_value_data_type == "text"
+    assert "objective: test" in text_piece.original_value
+    assert "response: some text" in text_piece.original_value
+
+
+@pytest.mark.asyncio
+async def test_score_value_with_llm_prepended_text_works_with_audio(good_json):
+    """Test that prepended_text_message_piece works with audio content (type-independent)."""
+    chat_target = MagicMock(PromptChatTarget)
+    good_json_resp = Message(
+        message_pieces=[MessagePiece(role="assistant", original_value=good_json, conversation_id="test-convo")]
+    )
+    chat_target.send_prompt_async = AsyncMock(return_value=[good_json_resp])
+
+    scorer = MockScorer()
+
+    await scorer._score_value_with_llm(
+        prompt_target=chat_target,
+        system_prompt="system_prompt",
+        message_value="test_audio.wav",
+        message_data_type="audio_path",
+        scored_prompt_id="123",
+        prepended_text_message_piece="objective: transcribe and evaluate\nresponse:",
+        category="category",
+        objective="task",
+    )
+
+    # Get the message that was sent
+    call_args = chat_target.send_prompt_async.call_args
+    sent_message = call_args.kwargs["message"]
+
+    # Should have 2 pieces: text context + audio
+    assert len(sent_message.message_pieces) == 2
+
+    # First piece should be text context
+    text_piece = sent_message.message_pieces[0]
+    assert text_piece.converted_value_data_type == "text"
+
+    # Second piece should be audio
+    audio_piece = sent_message.message_pieces[1]
+    assert audio_piece.converted_value_data_type == "audio_path"
+    assert audio_piece.original_value == "test_audio.wav"
+
+
 def test_scorer_extract_task_from_response(patch_central_database):
     """
     Test that _extract_task_from_response properly gathers text from the
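
The three new tests can be run on their own with `pytest tests/unit/score/test_scorer.py -k prepended_text` (assuming the repository's standard pytest setup); each one stubs the chat target with `AsyncMock` and inspects the `Message` handed to `send_prompt_async`.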
