```diff
@@ -36,18 +36,6 @@
 from lm_eval.models.hf_vlms import HFMultimodalLM
 from lm_eval.models.huggingface import HFLM as eval_wrapper
 from lm_eval.tasks import get_task_dict
-from torchtune import utils
-from torchtune.data import (
-    format_content_with_images,
-    left_pad_sequence,
-    Message,
-    padded_collate_tiled_images_and_mask,
-)
-from torchtune.generation import generate, sample
-
-from torchtune.modules.common_utils import local_kv_cache
-from torchtune.modules.model_fusion import DeepFusionModel
-from torchtune.modules.transforms import Transform
 
 
 def setup_cache_padded_seq_input_pos_max_seq_length_for_prefill(
@@ -209,6 +197,20 @@ class VLMEvalWrapper(HFMultimodalLM):
             the max number of images in MMMU.
     """
 
+    # Having the imports here allows running other evals without installing torchtune
+    from torchtune import utils
+    from torchtune.data import (
+        format_content_with_images,
+        left_pad_sequence,
+        Message,
+        padded_collate_tiled_images_and_mask,
+    )
+    from torchtune.generation import generate, sample
+
+    from torchtune.modules.common_utils import local_kv_cache
+    from torchtune.modules.model_fusion import DeepFusionModel
+    from torchtune.modules.transforms import Transform
+
     def __init__(
         self,
         model: DeepFusionModel,
```
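The change relies on the standard optional-dependency pattern: as long as the `torchtune` imports execute only on the code path that needs them, evals that never touch the VLM wrapper can run on an install without torchtune. A minimal sketch of that idea, using hypothetical `run_text_eval`/`run_vlm_eval` entry points that are not part of the actual recipe (only the torchtune import path is real):

```python
# Sketch only: the entry-point names are made up for illustration.


def run_text_eval(task: str) -> None:
    # No torchtune import anywhere on this path, so it works even when
    # torchtune is not installed.
    print(f"running text eval for {task}")


def run_vlm_eval(task: str) -> None:
    # Deferred import: executed at call time, not at module import time.
    # Only a user who actually requests a multimodal eval pays the cost
    # (or sees the error) of the optional dependency.
    try:
        from torchtune.modules.model_fusion import DeepFusionModel  # noqa: F401
    except ImportError as e:
        raise RuntimeError(
            "multimodal eval requires torchtune (pip install torchtune)"
        ) from e
    print(f"running VLM eval for {task}")
```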