@@ -100,10 +100,23 @@ def _get_sampling_params(self, gen_kwargs: dict) -> SamplingParams:
             "max_gen_toks": "max_tokens",
             "until": "stop",
         }
+        # IMPORTANT:
+        # lm-evaluation-harness controls generation primarily via per-task gen_kwargs.
+        # For example, the `local-completions` model wrapper uses:
+        #   max_tokens  <- gen_kwargs["max_tokens"] or gen_kwargs["max_gen_toks"] or _max_gen_toks
+        #   temperature <- gen_kwargs.get("temperature", 0)
+        #   stop        <- gen_kwargs.get("until", ...)
+        # See: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/openai_completions.py
+
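+        # Illustration (hypothetical task values): a task YAML with
+        #   generation_kwargs: {until: ["</s>"], max_gen_toks: 512, temperature: 0.0}
+        # arrives here as gen_kwargs = {"until": ["</s>"], "max_gen_toks": 512, "temperature": 0.0},
+        # so the defaults below only apply when the task config omits a field.
+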
         if self.sampling_params is None:
-            sampling_params = SamplingParams()
+            sampling_params = SamplingParams(
+                max_tokens=gen_kwargs.get("max_gen_toks", 256),
+                temperature=gen_kwargs.get("temperature", 0),
+                stop=gen_kwargs.get("until", None),
+            )
         else:
             sampling_params = copy.deepcopy(self.sampling_params)
+
         for lm_eval_key, trtllm_key in params_mapping.items():
             value = gen_kwargs.pop(lm_eval_key, None)
             if value is not None:
@@ -714,3 +727,156 @@ def command(ctx, **kwargs) -> None:
         kwargs[
             "stop"] = "<|endoftext|>"  # NOTE: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/mmmu/_template_yaml#L10
         MMMU.command_harness(ctx, **kwargs)
+
+
+class LongBenchV1(LmEvalEvaluator):
+    """
+    LongBench v1 evaluation via lm-evaluation-harness.
+
+    Notes:
+    - In lm-eval, `longbench` is typically a *group task* that expands into many
+      subtasks. The base `LmEvalEvaluator.evaluate()` assumes a single task
+      key exists in `results["results"][task_name]`, so we override evaluation
+      to aggregate over subtasks.
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__("longbench", **kwargs)
+
+    @staticmethod
+    def _flatten_task_dict(task_dict: dict) -> List[str]:
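+        """Recursively collect leaf task names from a (possibly nested) lm-eval task_dict."""
+        # Illustrative shape only: {"longbench": {"longbench_single": {<leaf task>: ..., ...}}}
+        # flattens to the leaf keys, i.e. the concrete subtask names.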
+        names: List[str] = []
+        for k, v in task_dict.items():
+            if isinstance(v, dict):
+                names.extend(LongBenchV1._flatten_task_dict(v))
+            else:
+                names.append(k)
+        return names
+
+    @staticmethod
+    def _get_group_score(metrics: Dict[str, Any],
+                         *,
+                         preferred_filter: str = "none") -> Optional[float]:
+        """
+        lm-eval stores group metrics as "<metric>,<filter>" (e.g., "score,none").
+        Prefer "score,none" (matches printed table), otherwise accept any
+        "score,<filter>" key.
+        """
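+        # Illustrative metrics shape (hypothetical values; exact keys depend on the
+        # lm-eval version): {"alias": "longbench_single", "score,none": 0.41, ...}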
+        if not isinstance(metrics, dict):
+            return None
+
+        preferred_key = f"score,{preferred_filter}"
+        v = metrics.get(preferred_key, None)
+        if isinstance(v, (int, float)):
+            return float(v)
+
+        # Fall back to any other "score,<filter>" key, as documented above.
+        for key, value in metrics.items():
+            if key.startswith("score,") and isinstance(value, (int, float)):
+                return float(value)
+
+        return None
+
+    def evaluate(self,
+                 llm: Union[LLM, PyTorchLLM],
+                 sampling_params: Optional[SamplingParams] = None,
+                 streaming: bool = False) -> float:
+        import lm_eval
+
+        lm_cls = MultimodalLmEvalWrapper if self.MULTIMODAL else LmEvalWrapper
+        results = lm_eval.evaluate(
+            lm=lm_cls(llm,
+                      sampling_params=sampling_params,
+                      streaming=streaming,
+                      chat_template_kwargs=self.chat_template_kwargs),
+            task_dict=self.task_dict,
+            limit=self.num_samples,
+            apply_chat_template=self.apply_chat_template,
+            fewshot_as_multiturn=self.fewshot_as_multiturn,
+            system_instruction=self.system_prompt)
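+        # `results` is consumed below via results["groups"] (per-subgroup metrics)
+        # and results["group_subtasks"] (group -> subgroup/subtask names);
+        # make_table() renders the same data as a human-readable table.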
+
+        logger.info(
+            f"lm-eval {self.task_name} results:\n{lm_eval.utils.make_table(results)}"
+        )
+
+        # LongBench is a group task in lm-eval. lm-eval already computes subgroup
+        # "score" values (e.g., `longbench_fewshot`, `longbench_single`, ...).
+        # To keep this implementation simple and aligned with the printed table,
+        # we compute the final LongBench score as the unweighted mean of subgroup
+        # scores.
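+        # Worked example (hypothetical numbers): if the table reports subgroup scores
+        # {"longbench_single": 0.42, "longbench_fewshot": 0.30}, the final score is
+        # mean(0.42, 0.30) * 100 = 36.00.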
+        group_results: Dict[str, Dict[str, Any]] = results.get("groups", {})
+        subgroup_names = results.get("group_subtasks",
+                                     {}).get(self.task_name, [])
+        if not subgroup_names:
+            raise KeyError(
+                f"lm-eval did not provide subgroup list for group '{self.task_name}'. "
+                "Expected `results['group_subtasks'][task_name]` to exist.")
+
+        subgroup_scores: List[float] = []
+        missing: List[str] = []
+        for name in subgroup_names:
+            m = group_results.get(name, None)
+            score = self._get_group_score(m)
+            if score is None:
+                missing.append(name)
+            else:
+                subgroup_scores.append(score)
+
+        if not subgroup_scores:
+            raise KeyError(
+                f"lm-eval did not provide subgroup 'score' metrics for '{self.task_name}'. "
+                f"Missing subgroups: {missing[:10]}")
+
+        result_acc = float(np.mean(subgroup_scores)) * 100
+        logger.info(
+            f"lm-eval {self.task_name} average 'score' across {len(subgroup_scores)} subgroups: {result_acc:.2f}"
+        )
+        return result_acc
+
+    @click.command("longbench_v1")
+    @click.option(
+        "--dataset_path",
+        type=str,
+        default=None,
+        help=
+        "The path to LongBench dataset. If unspecified, the dataset is downloaded from HF hub."
+    )
+    @click.option(
+        "--num_samples",
+        type=int,
+        default=None,
+        help="Number of samples to run the evaluation; None means full dataset."
+    )
+    @click.option("--random_seed",
+                  type=int,
+                  default=0,
+                  help="Random seed for dataset processing.")
+    @click.option("--apply_chat_template",
+                  type=click.BOOL,
+                  default=True,
+                  show_default=True,
+                  help="Whether to apply chat template.")
+    @click.option(
+        "--chat_template_kwargs",
+        type=str,
+        default=None,
+        callback=lambda ctx, param, value: json.loads(value) if value else None,
+        help=
+        'Chat template kwargs as JSON string, e.g., \'{"thinking_budget": 0}\'')
+    @click.option("--system_prompt",
+                  type=str,
+                  default=None,
+                  help="System prompt.")
+    @click.pass_context
+    @staticmethod
+    def command(ctx, **kwargs) -> None:
+        llm: Union[LLM, PyTorchLLM] = ctx.obj
+
+        evaluator = LongBenchV1(
+            dataset_path=kwargs.pop("dataset_path", None),
+            num_samples=kwargs.pop("num_samples", None),
+            random_seed=kwargs.pop("random_seed", 0),
+            apply_chat_template=kwargs.pop("apply_chat_template", True),
+            system_prompt=kwargs.pop("system_prompt", None),
+            chat_template_kwargs=kwargs.pop("chat_template_kwargs", None))
+
+        # Let lm-eval task configs control sampling via gen_kwargs.
+        sampling_params = None
+
+        evaluator.evaluate(llm, sampling_params)
+        llm.shutdown()
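+
+        # Example invocation (illustrative; assumes this command is wired into the
+        # trtllm-eval CLI, and the model path is a placeholder):
+        #   trtllm-eval --model <model_dir> longbench_v1 --num_samples 100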