@@ -668,3 +668,132 @@ def test_format_experiment_results_basic():
668668
669669 langfuse_client .flush ()
670670 time .sleep (1 )
671+
672+
def test_boolean_score_types():
    """Test that BOOLEAN score types are properly ingested and persisted.

    Runs a three-item experiment whose mock task answers two items correctly
    and one incorrectly, then checks that:

    * each item-level evaluation is a ``bool`` tagged ``ScoreDataType.BOOLEAN``,
    * the run-level evaluation is ``False`` because one item fails,
    * every item trace fetched back from the API carries at least one score,
      and all of its scores report the ``"BOOLEAN"`` data type.
    """
    from langfuse.api import ScoreDataType

    langfuse_client = get_client()

    def boolean_evaluator(*, input, output, expected_output=None, **kwargs):
        """Return a BOOLEAN evaluation: does ``output`` contain ``expected_output``?

        The match is a case-insensitive substring test. A falsy
        ``expected_output`` yields ``value=False``.
        """
        if not expected_output:
            return Evaluation(
                name="has_expected_content",
                value=False,
                data_type=ScoreDataType.BOOLEAN,
                comment="No expected output to check",
            )

        contains_expected = expected_output.lower() in str(output).lower()
        verdict = "contains" if contains_expected else "does not contain"
        return Evaluation(
            name="has_expected_content",
            value=contains_expected,
            data_type=ScoreDataType.BOOLEAN,
            comment=f"Output {verdict} expected content",
        )

    def boolean_run_evaluator(*, item_results: List[ExperimentItemResult], **kwargs):
        """Return a BOOLEAN run evaluation: did every item pass the content check?

        An empty ``item_results`` yields ``value=False``.
        """
        if not item_results:
            return Evaluation(
                name="all_items_pass",
                value=False,
                data_type=ScoreDataType.BOOLEAN,
                comment="No items to evaluate",
            )

        # The run fails as soon as any "has_expected_content" evaluation is
        # exactly False; evaluations with other names are ignored.
        all_pass = not any(
            evaluation.name == "has_expected_content" and evaluation.value is False
            for item_result in item_results
            for evaluation in item_result.evaluations
        )
        prefix = "All" if all_pass else "Not all"

        return Evaluation(
            name="all_items_pass",
            value=all_pass,
            data_type=ScoreDataType.BOOLEAN,
            comment=f"{prefix} items passed the boolean evaluation",
        )

    # Test data where some items should pass and some should fail
    test_data = [
        {"input": "What is the capital of Germany?", "expected_output": "Berlin"},
        {"input": "What is the capital of France?", "expected_output": "Paris"},
        {"input": "What is the capital of Spain?", "expected_output": "Madrid"},
    ]

    # Task that returns correct answers for Germany and France, but wrong for Spain
    def mock_task_with_boolean_results(*, item: ExperimentItem, **kwargs):
        """Return a canned answer keyed off the country named in the item's input."""
        # Items may be plain dicts or objects exposing an ``input`` attribute.
        input_val = (
            item.get("input")
            if isinstance(item, dict)
            else getattr(item, "input", "unknown")
        )
        input_str = str(input_val) if input_val is not None else ""

        if "Germany" in input_str:
            return "The capital is Berlin"
        elif "France" in input_str:
            return "The capital is Paris"
        else:
            return "I don't know the capital"

    result = langfuse_client.run_experiment(
        name="Boolean score type test",
        description="Test BOOLEAN data type in scores",
        data=test_data,
        task=mock_task_with_boolean_results,
        evaluators=[boolean_evaluator],
        run_evaluators=[boolean_run_evaluator],
    )

    # Validate basic result structure
    assert len(result.item_results) == 3
    assert len(result.run_evaluations) == 1

    # Validate individual item evaluations have boolean values
    expected_results = [
        True,
        True,
        False,
    ]  # Germany and France should pass, Spain should fail
    for i, item_result in enumerate(result.item_results):
        assert len(item_result.evaluations) == 1
        eval_result = item_result.evaluations[0]
        assert eval_result.name == "has_expected_content"
        assert isinstance(eval_result.value, bool)
        assert eval_result.value == expected_results[i]
        assert eval_result.data_type == ScoreDataType.BOOLEAN

    # Validate run evaluation is boolean and should be False (not all items passed)
    run_eval = result.run_evaluations[0]
    assert run_eval.name == "all_items_pass"
    assert isinstance(run_eval.value, bool)
    assert run_eval.value is False  # Spain should fail, so not all pass
    assert run_eval.data_type == ScoreDataType.BOOLEAN

    # Flush and wait for server processing
    langfuse_client.flush()
    time.sleep(3)

    # Verify scores are persisted via API with correct data types
    api = get_api()
    for i, item_result in enumerate(result.item_results):
        trace_id = item_result.trace_id
        assert trace_id is not None, f"Item {i} should have a trace_id"

        # Fetch trace from API to verify score persistence
        trace = api.trace.get(trace_id)
        assert trace is not None, f"Trace {trace_id} should exist"

        # Without this guard the loop below passes vacuously when no score
        # was ingested, which would defeat the purpose of the test.
        assert trace.scores, f"Trace {trace_id} should have at least one score"

        for score in trace.scores:
            assert score.data_type == "BOOLEAN"
0 commit comments