@@ -634,16 +634,26 @@ def cluster_run(req: ClusterRunRequest) -> Dict[str, Any]:
634634 if not score_columns_to_use and req .operationalRows :
635635 import pandas as pd
636636 operational_df = pd .DataFrame (req .operationalRows )
637-
638- # Check if 'score' column already exists (nested dict format)
639- if 'score' in operational_df .columns :
637+
638+ # Check if 'score' or 'scores' column already exists (nested dict format)
639+ # Frontend may send either 'score' (singular) or 'scores' (plural)
640+ score_column_name = None
641+ if 'scores' in operational_df .columns :
642+ score_column_name = 'scores'
643+ elif 'score' in operational_df .columns :
644+ score_column_name = 'score'
645+
646+ if score_column_name :
640647 # Check if it's actually a dict (not a string or number)
641- sample_score = operational_df ['score' ].iloc [0 ] if len (operational_df ) > 0 else None
648+ sample_score = operational_df [score_column_name ].iloc [0 ] if len (operational_df ) > 0 else None
642649 if not isinstance (sample_score , dict ):
643- logger .info ("'score ' column exists but is not a dict - will attempt to detect score columns" )
650+ logger .info (f"' { score_column_name } ' column exists but is not a dict - will attempt to detect score columns" )
644651 else :
645- logger .info ("'score ' column already in nested dict format - no conversion needed" )
652+ logger .info (f"' { score_column_name } ' column already in nested dict format - no conversion needed" )
646653 score_columns_to_use = None
654+ # Normalize to 'score' for consistency
655+ if score_column_name == 'scores' :
656+ operational_df ['score' ] = operational_df ['scores' ]
647657 else :
648658 # Try to detect score columns based on naming patterns
649659 # Look for columns like: score_X, X_score, helpfulness, accuracy, etc.
@@ -669,7 +679,19 @@ def cluster_run(req: ClusterRunRequest) -> Dict[str, Any]:
669679 score_columns_to_use = potential_score_cols
670680 else :
671681 logger .info ("No score columns detected" )
672-
682+
683+ # If we normalized 'scores' to 'score', update req.operationalRows
684+ if score_column_name == 'scores' :
685+ logger .info ("🔄 Normalizing 'scores' column to 'score' for backend compatibility" )
686+ req .operationalRows = operational_df .to_dict ('records' )
687+ # Log sample after normalization
688+ if req .operationalRows :
689+ sample = req .operationalRows [0 ]
690+ logger .info (f" ✓ Sample after normalization:" )
691+ logger .info (f" - Has 'score' key: { 'score' in sample } " )
692+ logger .info (f" - Score value: { sample .get ('score' )} " )
693+ logger .info (f" - Score type: { type (sample .get ('score' ))} " )
694+
673695 # Convert score columns if needed
674696 if score_columns_to_use :
675697 logger .info (f"Converting score columns to dict format: { score_columns_to_use } " )
@@ -773,7 +795,11 @@ def cluster_run(req: ClusterRunRequest) -> Dict[str, Any]:
773795 logger .warning (f" Sample from operationalRows: question_id='{ req .operationalRows [0 ].get ('question_id' )} ' (type: { type (req .operationalRows [0 ].get ('question_id' ))} ), model='{ req .operationalRows [0 ].get ('model' )} ' (type: { type (req .operationalRows [0 ].get ('model' ))} )" )
774796
775797 # Create minimal conversation (use empty data if no matching row found)
776- scores = matching_row .get ("score" , {}) if matching_row else {}
798+ # Try both 'score' and 'scores' fields for compatibility
799+ if matching_row :
800+ scores = matching_row .get ("score" ) or matching_row .get ("scores" ) or {}
801+ else :
802+ scores = {}
777803
778804 conv = ConversationRecord (
779805 question_id = question_id ,
@@ -784,8 +810,21 @@ def cluster_run(req: ClusterRunRequest) -> Dict[str, Any]:
784810 meta = {}
785811 )
786812 conversations .append (conv )
787-
813+
788814 logger .info (f"✅ Matched { matches_found } /{ len (property_keys )} conversations with operationalRows" )
815+
816+ # Enhanced logging for debugging quality metrics
817+ if matches_found > 0 and conversations :
818+ # Log sample conversation scores
819+ sample_conv = conversations [0 ]
820+ logger .info (f"📊 Score field verification:" )
821+ logger .info (f" - Sample conversation has scores: { bool (sample_conv .scores )} " )
822+ logger .info (f" - Scores type: { type (sample_conv .scores )} " )
823+ logger .info (f" - Scores content: { sample_conv .scores } " )
824+ if isinstance (sample_conv .scores , dict ):
825+ logger .info (f" - Score keys: { list (sample_conv .scores .keys ())} " )
826+ else :
827+ logger .warning ("⚠️ No conversations matched with operationalRows - quality metrics will be empty!" )
789828
790829 # Create PropertyDataset with matching conversations and properties
791830 dataset = PropertyDataset (
@@ -895,7 +934,26 @@ def cluster_run(req: ClusterRunRequest) -> Dict[str, Any]:
895934 model_cluster_scores_dict = clustered_dataset .model_stats .get ("model_cluster_scores" , {})
896935 cluster_scores_dict = clustered_dataset .model_stats .get ("cluster_scores" , {})
897936 model_scores_dict = clustered_dataset .model_stats .get ("model_scores" , {})
898-
937+
938+ # Debug: Log what was extracted from model_stats
939+ logger .info (f"📈 After FunctionalMetrics computation:" )
940+ logger .info (f" - model_cluster_scores type: { type (model_cluster_scores_dict )} " )
941+ logger .info (f" - cluster_scores type: { type (cluster_scores_dict )} " )
942+ logger .info (f" - model_scores type: { type (model_scores_dict )} " )
943+
944+ if hasattr (model_cluster_scores_dict , 'shape' ):
945+ logger .info (f" - model_cluster_scores shape: { model_cluster_scores_dict .shape } " )
946+ logger .info (f" - model_cluster_scores columns: { list (model_cluster_scores_dict .columns )} " )
947+ if hasattr (cluster_scores_dict , 'shape' ):
948+ logger .info (f" - cluster_scores shape: { cluster_scores_dict .shape } " )
949+ logger .info (f" - cluster_scores columns: { list (cluster_scores_dict .columns )} " )
950+ if hasattr (model_scores_dict , 'shape' ):
951+ logger .info (f" - model_scores shape: { model_scores_dict .shape } " )
952+ logger .info (f" - model_scores columns: { list (model_scores_dict .columns )} " )
953+ # Check if quality columns exist
954+ quality_cols = [col for col in model_scores_dict .columns if col .startswith ('quality_' )]
955+ logger .info (f" - model_scores quality columns: { quality_cols } " )
956+
899957 # Convert to the format expected by the rest of the code
900958 # FunctionalMetrics returns DataFrames, convert back to nested dicts
901959 if hasattr (model_cluster_scores_dict , 'to_dict' ):
@@ -1196,10 +1254,96 @@ def cluster_run(req: ClusterRunRequest) -> Dict[str, Any]:
11961254 row ["metadata" ] = metrics .get ("metadata" , {})
11971255
11981256 cluster_scores_array .append (row )
1199-
1200- # Note: model_scores would need to be computed separately if needed
1201- # For now, we'll return an empty array as it's not computed in this endpoint
1202-
1257+
1258+ # Transform model_scores to array format
1259+ model_scores_array = []
1260+ model_scores_dict = scores .get ("model_scores" , {})
1261+
1262+ # Check if model_scores_dict is a DataFrame (from FunctionalMetrics)
1263+ if hasattr (model_scores_dict , 'to_dict' ):
1264+ # It's a DataFrame, convert to dict first
1265+ import pandas as pd
1266+ df = model_scores_dict
1267+
1268+ logger .info (f"🔧 Transforming model_scores DataFrame to array format:" )
1269+ logger .info (f" - DataFrame shape: { df .shape } " )
1270+ logger .info (f" - DataFrame columns: { list (df .columns )} " )
1271+
1272+ for _ , row in df .iterrows ():
1273+ model_name = row ['model' ]
1274+
1275+ model_row = {
1276+ "model" : model_name ,
1277+ "size" : row .get ('size' , 0 ),
1278+ # Don't include 'cluster' field for model_scores (it's aggregated across all clusters)
1279+ }
1280+
1281+ # Flatten quality metrics: quality_helpfulness -> quality_helpfulness
1282+ for col in df .columns :
1283+ if col .startswith ('quality_' ) and not col .startswith ('quality_delta_' ):
1284+ if not any (x in col for x in ['_ci_' , '_significant' ]):
1285+ model_row [col ] = row [col ]
1286+ elif col .startswith ('quality_delta_' ):
1287+ if not any (x in col for x in ['_ci_' , '_significant' ]):
1288+ model_row [col ] = row [col ]
1289+
1290+ # Add confidence intervals if they exist
1291+ for col in df .columns :
1292+ if '_ci_lower' in col or '_ci_upper' in col or '_significant' in col :
1293+ model_row [col ] = row [col ]
1294+
1295+ model_scores_array .append (model_row )
1296+ else :
1297+ # It's already a dict, transform it similar to cluster_scores
1298+ for model_name , metrics in model_scores_dict .items ():
1299+ if not isinstance (metrics , dict ):
1300+ continue
1301+
1302+ row = {
1303+ "model" : model_name ,
1304+ "size" : metrics .get ("size" , 0 ),
1305+ }
1306+
1307+ # Flatten quality metrics
1308+ quality = metrics .get ("quality" )
1309+ if quality and isinstance (quality , dict ):
1310+ for metric_name , metric_value in quality .items ():
1311+ row [f"quality_{ metric_name } " ] = metric_value
1312+
1313+ # Flatten quality_delta metrics
1314+ quality_delta = metrics .get ("quality_delta" )
1315+ if quality_delta and isinstance (quality_delta , dict ):
1316+ for metric_name , metric_value in quality_delta .items ():
1317+ row [f"quality_{ metric_name } _delta" ] = metric_value
1318+
1319+ # Add confidence intervals if they exist
1320+ quality_ci = metrics .get ("quality_ci" , {})
1321+ for metric_name , ci_dict in quality_ci .items ():
1322+ if isinstance (ci_dict , dict ):
1323+ row [f"quality_{ metric_name } _ci_lower" ] = ci_dict .get ("lower" )
1324+ row [f"quality_{ metric_name } _ci_upper" ] = ci_dict .get ("upper" )
1325+
1326+ quality_delta_ci = metrics .get ("quality_delta_ci" , {})
1327+ for metric_name , ci_dict in quality_delta_ci .items ():
1328+ if isinstance (ci_dict , dict ):
1329+ row [f"quality_delta_{ metric_name } _ci_lower" ] = ci_dict .get ("lower" )
1330+ row [f"quality_delta_{ metric_name } _ci_upper" ] = ci_dict .get ("upper" )
1331+
1332+ # Add significance flags if they exist
1333+ quality_delta_significant = metrics .get ("quality_delta_significant" , {})
1334+ for metric_name , is_significant in quality_delta_significant .items ():
1335+ row [f"quality_delta_{ metric_name } _significant" ] = is_significant
1336+
1337+ model_scores_array .append (row )
1338+
1339+ # Log the transformed model_scores
1340+ if model_scores_array :
1341+ logger .info (f"✅ Transformed { len (model_scores_array )} model_scores rows" )
1342+ logger .info (f" - Sample row keys: { list (model_scores_array [0 ].keys ())} " )
1343+ logger .info (f" - Sample row: { model_scores_array [0 ]} " )
1344+ else :
1345+ logger .warning ("⚠️ No model_scores computed - this may indicate missing quality metrics in the data" )
1346+
12031347 # Persist flattened metrics in expected JSONL format for downstream endpoints/loaders
12041348 try :
12051349 if results_dir is not None :
@@ -1212,6 +1356,12 @@ def cluster_run(req: ClusterRunRequest) -> Dict[str, Any]:
12121356 (results_dir / "cluster_scores_df.jsonl" ).write_text (
12131357 cs_df .to_json (orient = 'records' , lines = True )
12141358 )
1359+ # Save model_scores as well
1360+ if model_scores_array :
1361+ ms_df = pd .DataFrame (model_scores_array )
1362+ (results_dir / "model_scores_df.jsonl" ).write_text (
1363+ ms_df .to_json (orient = 'records' , lines = True )
1364+ )
12151365 logger .info (f"✓ Saved metrics JSONL files under: { results_dir } " )
12161366 except Exception as e :
12171367 logger .warning (f"Failed to save metrics JSONL files: { e } " )
@@ -1224,7 +1374,7 @@ def cluster_run(req: ClusterRunRequest) -> Dict[str, Any]:
12241374 "metrics" : {
12251375 "model_cluster_scores" : model_cluster_scores_array ,
12261376 "cluster_scores" : cluster_scores_array ,
1227- "model_scores" : [] # Not computed in clustering-only endpoint
1377+ "model_scores" : model_scores_array # Now properly computed and transformed
12281378 }
12291379 }
12301380
0 commit comments