@@ -309,6 +309,15 @@ def create_display_name(row):
309309 return display_df , plots
310310
311311
312+ def _agent_with_probably_incomplete_model_usage_info (agent_name ):
313+ # See https://github.com/allenai/astabench-issues/issues/330
314+ lowered_agent_name = agent_name .lower ()
315+ is_elicit = lowered_agent_name == "elicit"
316+ is_scispace = lowered_agent_name == "scispace"
317+ is_you_dot_com = ("you" in lowered_agent_name ) and ("com" in lowered_agent_name )
318+ return any ([is_elicit , is_scispace , is_you_dot_com ])
319+
320+
312321def _get_dataframe (
313322 eval_results : datasets .DatasetDict ,
314323 split : str ,
@@ -333,10 +342,23 @@ def _get_dataframe(
333342 rows = []
334343 for itm in ds :
335344 ev = LeaderboardSubmission .model_validate (itm )
345+ sub = ev .submission
346+
347+ probably_incomplete_model_info = (
348+ _agent_with_probably_incomplete_model_usage_info (sub .agent_name )
349+ )
336350
337351 model_token_counts : dict [str , int ] = {}
338352 if ev .results :
339353 for task_result in ev .results :
354+
355+ if probably_incomplete_model_info :
356+ logger .warning (
357+ f"Dropping model_usages and model_costs for the following submission because model usage info may be incomplete: { sub } ."
358+ )
359+ task_result .model_usages = None
360+ task_result .model_costs = None
361+
340362 if task_result .model_usages :
341363 for usage_list in task_result .model_usages :
342364 for model_usage in usage_list :
@@ -357,7 +379,6 @@ def _get_dataframe(
357379 LB_MODEL_NAME_MAPPING .get (name , name ) for name in sorted_raw_names
358380 ]
359381
360- sub = ev .submission
361382 # only format if submit_time present, else leave as None
362383 ts = sub .submit_time
363384 if ts is not None :
0 commit comments