@@ -375,7 +375,6 @@ def to_red_team_result(
375375 output_items = ordered_output_items ,
376376 eval_run = eval_run ,
377377 red_team_info = red_team_info ,
378- include_conversations = False ,
379378 scan_name = scan_name ,
380379 run_id_override = run_id_override ,
381380 eval_id_override = eval_id_override ,
@@ -413,7 +412,7 @@ def _build_output_item(
413412 results = self ._build_output_result (
414413 conversation ,
415414 eval_row ,
416- sample_payload = sample_payload ,
415+ sample_payload = None ,
417416 )
418417 output_item_id = self ._resolve_output_item_id (
419418 eval_row , datasource_item_id , conversation_key , conversation_index
@@ -431,6 +430,7 @@ def _build_output_item(
431430 "id" : output_item_id ,
432431 "created_time" : created_time ,
433432 "status" : status ,
433+ "sample" : sample_payload ,
434434 "results" : results ,
435435 }
436436
@@ -584,6 +584,7 @@ def _build_output_result(
584584 "name" : risk_value ,
585585 "metric" : risk_value ,
586586 "passed" : passed ,
587+ "label" : "pass" if passed is True else ("fail" if passed is False else None ),
587588 "score" : score ,
588589 "threshold" : threshold ,
589590 "reason" : reason ,
@@ -592,9 +593,6 @@ def _build_output_result(
592593 if properties :
593594 result_entry ["properties" ] = properties
594595
595- if sample_payload :
596- result_entry ["sample" ] = sample_payload
597-
598596 results .append (result_entry )
599597
600598 if not results :
@@ -624,6 +622,7 @@ def _build_output_result(
624622 "name" : risk_value ,
625623 "metric" : risk_value ,
626624 "passed" : None ,
625+ "label" : None ,
627626 "score" : None ,
628627 "threshold" : attack_threshold ,
629628 "reason" : fallback_reason ,
@@ -632,9 +631,6 @@ def _build_output_result(
632631 if properties :
633632 fallback_result ["properties" ] = properties
634633
635- if sample_payload :
636- fallback_result ["sample" ] = sample_payload
637-
638634 results .append (fallback_result )
639635
640636 return results
@@ -1096,9 +1092,12 @@ def _compute_result_count(output_items: List[Dict[str, Any]]) -> Dict[str, int]:
10961092
10971093 @staticmethod
10981094 def _compute_per_testing_criteria (output_items : List [Dict [str , Any ]]) -> List [Dict [str , Any ]]:
1099- """Build aggregated pass/fail counts per testing criteria (risk category)."""
1095+ """Build aggregated pass/fail counts per testing criteria (risk category and attack strategy )."""
11001096
1097+ # Track by risk category (testing_criteria)
11011098 criteria : Dict [str , Dict [str , int ]] = {}
1099+ # Track by attack strategy
1100+ strategy_criteria : Dict [str , Dict [str , int ]] = {}
11021101
11031102 for item in output_items :
11041103 for result in item .get ("results" , []):
@@ -1111,13 +1110,28 @@ def _compute_per_testing_criteria(output_items: List[Dict[str, Any]]) -> List[Di
11111110 if passed_value is None :
11121111 continue
11131112
1113+ # Track by risk category
11141114 bucket = criteria .setdefault (str (name ), {"passed" : 0 , "failed" : 0 })
11151115 if passed_value :
11161116 bucket ["passed" ] += 1
11171117 else :
11181118 bucket ["failed" ] += 1
11191119
1120- return [
1120+ # Track by attack strategy from properties
1121+ properties = result .get ("properties" , {})
1122+ if isinstance (properties , dict ):
1123+ attack_technique = properties .get ("attack_technique" )
1124+ if attack_technique :
1125+ strategy_bucket = strategy_criteria .setdefault (
1126+ str (attack_technique ), {"passed" : 0 , "failed" : 0 }
1127+ )
1128+ if passed_value :
1129+ strategy_bucket ["passed" ] += 1
1130+ else :
1131+ strategy_bucket ["failed" ] += 1
1132+
1133+ # Build results list with risk categories
1134+ results = [
11211135 {
11221136 "testing_criteria" : criteria_name ,
11231137 "passed" : counts ["passed" ],
@@ -1126,6 +1140,19 @@ def _compute_per_testing_criteria(output_items: List[Dict[str, Any]]) -> List[Di
11261140 for criteria_name , counts in sorted (criteria .items ())
11271141 ]
11281142
1143+ # Add attack strategy summaries
1144+ for strategy_name , counts in sorted (strategy_criteria .items ()):
1145+ results .append (
1146+ {
1147+ "testing_criteria" : strategy_name ,
1148+ "attack_strategy" : strategy_name ,
1149+ "passed" : counts ["passed" ],
1150+ "failed" : counts ["failed" ],
1151+ }
1152+ )
1153+
1154+ return results
1155+
11291156 @staticmethod
11301157 def _build_data_source_section (parameters : Dict [str , Any ], red_team_info : Optional [Dict ]) -> Dict [str , Any ]:
11311158 """Build the data_source portion of the run payload for red-team scans."""
@@ -1179,7 +1206,6 @@ def _build_results_payload(
11791206 output_items : List [Dict [str , Any ]],
11801207 eval_run : Optional [Any ] = None ,
11811208 red_team_info : Optional [Dict ] = None ,
1182- include_conversations : bool = False ,
11831209 scan_name : Optional [str ] = None ,
11841210 run_id_override : Optional [str ] = None ,
11851211 eval_id_override : Optional [str ] = None ,
@@ -1191,7 +1217,6 @@ def _build_results_payload(
11911217 :param output_items: List of output items containing results for each conversation
11921218 :param eval_run: The MLFlow run object (optional)
11931219 :param red_team_info: Red team tracking information (optional)
1194- :param include_conversations: Whether to include conversation details (optional)
11951220 :param scan_name: Name of the scan (optional)
11961221 :param run_id_override: Override for run ID (optional)
11971222 :param eval_id_override: Override for eval ID (optional)
@@ -1290,7 +1315,4 @@ def _build_results_payload(
12901315 "output_items" : list_wrapper ,
12911316 }
12921317
1293- if include_conversations :
1294- run_payload ["conversations" ] = redteam_result .attack_details or scan_result .get ("attack_details" ) or []
1295-
12961318 return run_payload
0 commit comments