@@ -489,5 +489,131 @@ async def callback(
         assert metrics["code_vulnerability.code_vulnerability_defect_rate"] is not None
         assert metrics.get("code_vulnerability.code_vulnerability_defect_rate") >= 0.0
 
+        # Cleanup file
+        os.remove(file_name)
+
+    @pytest.mark.azuretest
+    def test_isa_sim_and_eval(self, project_scope, azure_cred):
+        azure_ai_project = {
+            "subscription_id": project_scope["subscription_id"],
+            "resource_group_name": project_scope["resource_group_name"],
+            "project_name": project_scope["project_name"],
+        }
+
+        response_from_llm = '''
+        Person 1 might experience emotions such as:
+        Curiosity – They may wonder what the experience of meditation feels like.
+        Admiration – They might appreciate Person 2’s ability to find peace and focus.
+        Inspiration – They could feel motivated to try meditation themselves.
+        Serenity – Simply observing a calm moment might bring them a sense of peace.
+        Happiness – Seeing someone enjoy a tranquil experience could make them feel happy.
+        Their emotions would likely depend on their own mindset and past experiences with meditation or peaceful settings.
+        '''
+
+        # Simple first-message-only echo callback
+        async def callback(
+            messages: List[Dict],
+            stream: bool = False,
+            session_state: Any = None,
+            context: Dict[str, Any] = None,
+        ) -> dict:
+            import re
+            generated_text = messages["messages"][0]["content"]
+
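+            # The simulator's prompt embeds the seed conversation between the
+            # <START CONVERSATION> and <END CONVERSATION> markers, followed by the
+            # query, so both parts are recovered with the regular expressions below.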
+            conversation_match = re.search(r"<START CONVERSATION>(.*?)<END CONVERSATION>", generated_text, re.DOTALL)
+            conversation = conversation_match.group(1).strip() if conversation_match else ""
+
+            query_match = re.search(r"<END CONVERSATION>\s*(.*)", generated_text, re.DOTALL)
+            query = query_match.group(1).strip() if query_match else ""
+
+            messages = {"messages": []}
+            user_message = {
+                "content": query,
+                "role": "user",
+                "context": conversation,
+            }
+
+            temperature = 0.0
+            formatted_response = {
+                "content": response_from_llm,
+                "role": "assistant",
+                "context": {
+                    "temperature": temperature,
+                },
+            }
+            messages["messages"].append(user_message)
+            messages["messages"].append(formatted_response)
+            return {
+                "messages": messages["messages"],
+                "stream": stream,
+                "session_state": session_state,
+                "context": conversation,
+            }
+
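+        # Run a single-turn, single-result adversarial simulation for the
+        # inference of sensitive attributes (ISA) scenario against the callback.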
+        simulator = AdversarialSimulator(azure_ai_project=azure_ai_project, credential=azure_cred)
+
+        simulator_output = asyncio.run(
+            simulator(
+                scenario=AdversarialScenario.ADVERSARIAL_ISA,
+                max_conversation_turns=1,
+                max_simulation_results=1,
+                target=callback,
+            )
+        )
+        assert len(simulator_output) == 1
+        assert len(simulator_output[0]["messages"]) == 2
+        assert simulator_output[0]["messages"][0]["content"] is not None
+        assert simulator_output[0]["messages"][1]["content"] is not None
+        assert simulator_output[0]["messages"][1]["context"] is not None
+
+        # Write the simulator output to a JSONL file in query/response format
+        file_name = "eval_isa_test.jsonl"
+
+        with open(file_name, "w") as file:
+            file.write(JsonLineChatProtocol(simulator_output[0]).to_eval_qr_json_lines())
+
+        # Evaluate the simulator output
+        isa_eval = ISAEvaluator(azure_cred, project_scope)
+        # Run the evaluation
+        eval_output = evaluate(
+            data=file_name,
+            evaluators={"inference_sensitive_attributes": isa_eval},
+        )
+
+        # Validate the results
+        assert eval_output is not None
+        assert eval_output["rows"] is not None
+        assert len(eval_output["rows"]) == 1
+
+        # Verify the rows
+        row_result_df = pd.DataFrame(eval_output["rows"])
+
+        assert "inputs.query" in row_result_df.columns.to_list()
+        assert "inputs.response" in row_result_df.columns.to_list()
+        assert "inputs.context" in row_result_df.columns.to_list()
+        assert "outputs.inference_sensitive_attributes.inference_sensitive_attributes_label" in row_result_df.columns.to_list()
+        assert "outputs.inference_sensitive_attributes.inference_sensitive_attributes_reason" in row_result_df.columns.to_list()
+        assert "outputs.inference_sensitive_attributes.inference_sensitive_attributes_details" in row_result_df.columns.to_list()
+
+        assert eval_output["rows"][0]["inputs.query"] == simulator_output[0]["messages"][0]["content"]
+        assert eval_output["rows"][0]["inputs.context"] == simulator_output[0]["messages"][1]["context"]
+        assert eval_output["rows"][0]["inputs.response"] == simulator_output[0]["messages"][1]["content"]
+
+        assert eval_output["rows"][0]["outputs.inference_sensitive_attributes.inference_sensitive_attributes_label"] in [True, False]
+        assert eval_output["rows"][0]["outputs.inference_sensitive_attributes.inference_sensitive_attributes_details"]["groundedness"] in [True, False]
+        assert eval_output["rows"][0]["outputs.inference_sensitive_attributes.inference_sensitive_attributes_details"]["emotional_state"] in [True, False]
+        assert eval_output["rows"][0]["outputs.inference_sensitive_attributes.inference_sensitive_attributes_details"]["protected_class"] in [True, False]
+
+        # Verify the metrics
+        metrics = eval_output["metrics"]
+        assert metrics is not None
+        assert "inference_sensitive_attributes.inference_sensitive_attributes_defect_rate" in metrics.keys()
+        assert metrics["inference_sensitive_attributes.inference_sensitive_attributes_defect_rate"] is not None
+        assert metrics.get("inference_sensitive_attributes.inference_sensitive_attributes_defect_rate") >= 0.0
+        assert metrics.get("inference_sensitive_attributes.inference_sensitive_attributes_details.emotional_state_defect_rate") >= 0.0
+        assert metrics.get("inference_sensitive_attributes.inference_sensitive_attributes_details.protected_class_defect_rate") >= 0.0
+        assert metrics.get("inference_sensitive_attributes.inference_sensitive_attributes_details.groundedness_defect_rate") >= 0.0
+
         # Cleanup file
         os.remove(file_name)