microsoft · vidhishanair · Sep 29, 2025 · Sep 30, 2025 · Oct 6, 2025 · Oct 8, 2025
diff --git a/environment.yml b/environment.yml
@@ -37,7 +37,6 @@ dependencies:
   - xz=5.6.4=h5eee18b_1
   - zlib=1.2.13=h5eee18b_1
   - pip:
-      - accelerate==1.6.0
       - anthropic==0.49.0
       - azure-ai-textanalytics>=5.3.0
       - azure-core>=1.29.5
@@ -47,10 +46,10 @@ dependencies:
       - datasets>=3.2.0
       - fuzzywuzzy>=0.18.0
       - jsonlines>=2.0.0
-      - pandas>=2.2.1
       - pillow>=10.0.1
       - torch>=2.6.0
-      - numpy==1.26.4
+      - numpy>=2.2
+      - pandas>=2.2.1
       - tqdm>=4.65.0
       - jinja2>=3.1.3
       - transformers>=4.51.3
@@ -63,8 +62,8 @@ dependencies:
       - google-generativeai>=0.7.0
       - openai>=1.35.5
       - bitsandbytes>=0.42.0
+      - pycocotools>=2.0.10
+      - vllm==0.8.5
       - accelerate>=0.21.0
-      - pycocotools>=2.0.8
-      - vllm>=0.8.5
-      - latex2sympy2>=1.9.1
+      # - latex2sympy2_extended[antlr4_13_2]  # optional dependency: install only if running the mathvision evaluation (imported lazily in mathvision_utils)
 prefix: /home/sayouse/miniconda3/envs/myenv
diff --git a/eureka_ml_insights/data_utils/mathvision_utils.py b/eureka_ml_insights/data_utils/mathvision_utils.py
@@ -1,7 +1,6 @@
 """Evaluates output of models for Math-V dataset; following https://github.com/mathllm/MATH-V/tree/main/evaluation"""
 
 from dataclasses import dataclass
-from latex2sympy2 import latex2sympy
 import pandas as pd
 import re
 
@@ -47,6 +46,8 @@ def eval_tuple(s):
     Note:
         This function relies on the latex2sympy function which is assumed to be defined elsewhere in the code.
     """
+    from latex2sympy2_extended import latex2sympy
+
     # Split the string by commas to get individual elements
     sl = s[1:-1].split(',')
 
@@ -89,7 +90,8 @@ def is_equal(asw: str, gt_asw: str) -> bool:
         bool: True if the answers are equivalent, otherwise False.
 
     """
-
+    from latex2sympy2_extended import latex2sympy
+
     # return gt_asw == asw
 
     # Check for empty strings after removing spaces and return False if any of them is empty.

diff --git a/eureka_ml_insights/user_configs/gpqa.py b/eureka_ml_insights/user_configs/gpqa.py
@@ -1,3 +1,4 @@
+import logging
 import os
 
 """This file contains user defined configuration classes for the GPQA dataset."""
@@ -474,7 +475,10 @@ def configure_pipeline(
     ) -> PipelineConfig:
         pipeline = super().configure_pipeline(model_config=model_config, resume_from=resume_from, **kwargs)
         # data preprocessing
-        self.data_processing_comp.data_reader_config.init_args["transform"].transforms.append(
-            MultiplyTransform(n_repeats=int(kwargs.get("n_repeats", 5)))
-        )
+        if "n_repeats" in kwargs and int(kwargs["n_repeats"]) != 5:
+            logging.warning(
+                f"n_repeats is set to {kwargs['n_repeats']} in kwargs, but will be overridden to 5 for GPQA_PIPELINE_5Run."
+            )
+        self.data_processing_comp.data_reader_config.init_args["transform"].transforms[-1] = MultiplyTransform(n_repeats=5)
+
         return pipeline
diff --git a/eureka_ml_insights/user_configs/image_understanding/spatial_reasoning.py b/eureka_ml_insights/user_configs/image_understanding/spatial_reasoning.py
@@ -1,4 +1,5 @@
 import os
+from typing import Any
 
 from eureka_ml_insights.configs.experiment_config import ExperimentConfig
 from eureka_ml_insights.core import EvalReporting, Inference, PromptProcessing
@@ -11,6 +12,7 @@
     DataReader,
     PrependStringTransform,
     SequenceTransform,
+    SamplerTransform
 )
 from eureka_ml_insights.data_utils.spatial_utils import (
     LowerCaseNoPunctuationConvertNumbers,
@@ -50,7 +52,7 @@ class SPATIAL_REASONING_PAIRS_PIPELINE(ExperimentConfig):
     There is no model_config by default and the model config must be passed in via command lime.
     """
 
-    def configure_pipeline(self, model_config, resume_from=None):
+    def configure_pipeline(self, model_config, resume_from=None, **kwargs: dict[str, Any]):
         # Configure the data processing component.
         self.data_processing_comp = PromptProcessingConfig(
             component_type=PromptProcessing,

diff --git a/eureka_ml_insights/user_configs/mathvision.py b/eureka_ml_insights/user_configs/mathvision.py
@@ -103,7 +103,7 @@ def configure_pipeline(
         # Eval Inference component round 1 (answer extraction).
         self.eval_inference_comp = InferenceConfig(
             component_type=Inference,
-            model_config=PERSONAL_GPT4O,
+            model_config=kwargs.get("eval_model_config", PERSONAL_GPT4O),
             data_loader_config=DataSetConfig(
                 MMDataLoader,
                 {"path": os.path.join(self.eval_data_pre_processing.output_dir, "transformed_data.jsonl"), "load_images":False},
@@ -131,7 +131,7 @@ def configure_pipeline(
         # Eval Inference component round 2 (LLM scoring)
         self.eval_inference_comp_two = InferenceConfig(
             component_type=Inference,
-            model_config=PERSONAL_GPT4O,
+            model_config=kwargs.get("eval_model_config", PERSONAL_GPT4O),
             data_loader_config=DataSetConfig(
                 MMDataLoader,
                 {"path": os.path.join(self.eval_data_pre_processing_two.output_dir, "transformed_data.jsonl"), "load_images":False},

diff --git a/eureka_ml_insights/user_configs/omni_math.py b/eureka_ml_insights/user_configs/omni_math.py
@@ -63,8 +63,7 @@ def configure_pipeline(self, model_config=None, resume_from=None, eval_resume_fr
             data_loader_config=DataSetConfig(
                 DataLoader,
                 {
-                    "path": os.path.join(self.data_processing_comp.output_dir, "transformed_data.jsonl"),
-                    "misc_columns": ["data_point_id","data_repeat_id"]
+                    "path": os.path.join(self.data_processing_comp.output_dir, "transformed_data.jsonl")
                 },
             ),
             output_dir=os.path.join(self.log_dir, "inference_result"),

diff --git a/eureka_ml_insights/user_configs/vision_language/spatial_grid.py b/eureka_ml_insights/user_configs/vision_language/spatial_grid.py
@@ -1,4 +1,5 @@
 import os
+from typing import Any
 
 from eureka_ml_insights.configs.experiment_config import ExperimentConfig
 from eureka_ml_insights.core import EvalReporting, Inference, PromptProcessing
@@ -11,6 +12,7 @@
     ExtractAnswerGrid,
     PrependStringTransform,
     SequenceTransform,
+    MultiplyTransform,
 )
 from eureka_ml_insights.metrics import CaseInsensitiveMatch, CountAggregator
 from eureka_ml_insights.configs import (
@@ -41,7 +43,7 @@ class SPATIAL_GRID_PIPELINE(ExperimentConfig):
     """This method is used to define an eval pipeline with inference and metric report components,
     on the grid counting dataset."""
 
-    def configure_pipeline(self, model_config: ModelConfig, resume_from: str = None) -> PipelineConfig:
+    def configure_pipeline(self, model_config: ModelConfig, resume_from: str = None, **kwargs: dict[str, Any]) -> PipelineConfig:
         # Configure the data processing component.
         self.data_processing_comp = PromptProcessingConfig(
             component_type=PromptProcessing,
@@ -51,6 +53,8 @@ def configure_pipeline(self, model_config: ModelConfig, resume_from: str = None)
                     "path": "microsoft/VISION_LANGUAGE",
                     "split": "val",
                     "tasks": "spatial_grid",
+                    "transform": MultiplyTransform(n_repeats=int(kwargs.get("n_repeats", 1))),
+
                 },
             ),
             output_dir=os.path.join(self.log_dir, "data_processing_output"),
@@ -113,8 +117,8 @@ def configure_pipeline(self, model_config: ModelConfig, resume_from: str = None)
 class SPATIAL_GRID_TEXTONLY_PIPELINE(SPATIAL_GRID_PIPELINE):
     """This class extends SPATIAL_GRID_PIPELINE to use text only data."""
 
-    def configure_pipeline(self, model_config: ModelConfig, resume_from: str = None) -> PipelineConfig:
-        config = super().configure_pipeline(model_config, resume_from)
+    def configure_pipeline(self, model_config: ModelConfig, resume_from: str = None, **kwargs: dict[str, Any]) -> PipelineConfig:
+        config = super().configure_pipeline(model_config, resume_from, **kwargs)
         self.data_processing_comp.data_reader_config.init_args["tasks"] = (
             "spatial_grid_text_only"
         )
@@ -125,7 +129,7 @@ class SPATIAL_GRID_REPORTING_PIPELINE(SPATIAL_GRID_PIPELINE):
     """This method is used to define an eval pipeline with only a metric report component,
     on the grid counting dataset."""
 
-    def configure_pipeline(self, model_config: ModelConfig, resume_from: str = None) -> PipelineConfig:
-        super().configure_pipeline(model_config, resume_from)
+    def configure_pipeline(self, model_config: ModelConfig, resume_from: str = None, **kwargs: dict[str, Any]) -> PipelineConfig:
+        super().configure_pipeline(model_config, resume_from, **kwargs)
         self.evalreporting_comp.data_reader_config.init_args["path"] = resume_from
         return PipelineConfig([self.evalreporting_comp], self.log_dir)
diff --git a/eureka_ml_insights/user_configs/vision_language/spatial_map.py b/eureka_ml_insights/user_configs/vision_language/spatial_map.py
@@ -1,4 +1,5 @@
 import os
+from typing import Any
 
 from eureka_ml_insights.configs.experiment_config import ExperimentConfig
 from eureka_ml_insights.core import EvalReporting, Inference, PromptProcessing, DataProcessing, DataJoin
@@ -20,6 +21,7 @@
     MultiplyTransform,
     SequenceTransform,
     RegexTransform,
+    SamplerTransform
 )
 from eureka_ml_insights.metrics import SubstringExistsMatch, BiLevelAggregator, BiLevelCountAggregator, CountAggregator
 
@@ -54,7 +56,7 @@ class SPATIAL_MAP_PIPELINE(ExperimentConfig):
     """This method is used to define an eval pipeline with inference and metric report components,
     on the spatial map dataset."""
 
-    def configure_pipeline(self, model_config: ModelConfig, resume_from: str = None) -> PipelineConfig:
+    def configure_pipeline(self, model_config: ModelConfig, resume_from: str = None, **kwargs: dict[str, Any]) -> PipelineConfig:
         # Configure the data processing component.
         self.data_processing_comp = PromptProcessingConfig(
             component_type=PromptProcessing,
@@ -64,7 +66,10 @@ def configure_pipeline(self, model_config: ModelConfig, resume_from: str = None)
                     "path": "microsoft/VISION_LANGUAGE",
                     "split": "val_noinstruction",
                     "tasks": "spatial_map",
-                    "transform": MultiplyTransform(n_repeats=5),
+                    "transform": SequenceTransform([
+                        # SamplerTransform(sample_count=10, random_seed=1),
+                        MultiplyTransform(n_repeats=int(kwargs.get("n_repeats", 1))),
+                    ]),
                 },
             ),
             prompt_template_path=os.path.join(
@@ -86,7 +91,7 @@ def configure_pipeline(self, model_config: ModelConfig, resume_from: str = None)
             ),
             output_dir=os.path.join(self.log_dir, "inference_result"),
             resume_from=resume_from,
-            max_concurrent=10,
+            max_concurrent=int(kwargs.get("max_concurrent", 10)),
         )
 
         self.preeval_data_post_processing_comp = DataProcessingConfig(
@@ -141,13 +146,13 @@ def configure_pipeline(self, model_config: ModelConfig, resume_from: str = None)
 
         self.inference_llm_answer_extract = InferenceConfig(
             component_type=Inference,
-            model_config=OAI_GPT4O_2024_11_20_CONFIG,
+            model_config=kwargs.get("eval_model_config", OAI_GPT4O_2024_11_20_CONFIG),
             data_loader_config=DataSetConfig(
                 DataLoader,
                 {"path": os.path.join(self.filter_empty_answer.output_dir, "transformed_data.jsonl")},
             ),
             output_dir=os.path.join(self.log_dir, "llm_answer_extract_inference_result"),
-            max_concurrent=1
+            max_concurrent=10
         )        
 
         self.data_join = DataJoinConfig(
@@ -443,8 +448,8 @@ def configure_pipeline(self, model_config: ModelConfig, resume_from: str = None)
 class SPATIAL_MAP_COT_PIPELINE(SPATIAL_MAP_PIPELINE):
     """This class extends SPATIAL_MAP_PIPELINE to use a COT prompt."""
 
-    def configure_pipeline(self, model_config: ModelConfig, resume_from: str = None) -> PipelineConfig:
-        config = super().configure_pipeline(model_config, resume_from)
+    def configure_pipeline(self, model_config: ModelConfig, resume_from: str = None, **kwargs: dict[str, Any]) -> PipelineConfig:
+        config = super().configure_pipeline(model_config, resume_from, **kwargs)
         self.data_processing_comp.prompt_template_path=os.path.join(
                 os.path.dirname(__file__),
                 "../../prompt_templates/vision_language_templates/cot.jinja",
@@ -454,8 +459,8 @@ def configure_pipeline(self, model_config: ModelConfig, resume_from: str = None)
 class SPATIAL_MAP_TEXTONLY_PIPELINE(SPATIAL_MAP_PIPELINE):
     """This class extends SPATIAL_MAP_PIPELINE to use text only data."""
 
-    def configure_pipeline(self, model_config: ModelConfig, resume_from: str = None) -> PipelineConfig:
-        config = super().configure_pipeline(model_config, resume_from)
+    def configure_pipeline(self, model_config: ModelConfig, resume_from: str = None, **kwargs: dict[str, Any]) -> PipelineConfig:
+        config = super().configure_pipeline(model_config, resume_from, **kwargs)
         self.data_processing_comp.data_reader_config.init_args["tasks"] = (
             "spatial_map_text_only"
         )
@@ -464,8 +469,8 @@ def configure_pipeline(self, model_config: ModelConfig, resume_from: str = None)
 class SPATIAL_MAP_COT_TEXTONLY_PIPELINE(SPATIAL_MAP_COT_PIPELINE):
     """This class extends SPATIAL_MAP_PIPELINE to use text only data."""
 
-    def configure_pipeline(self, model_config: ModelConfig, resume_from: str = None) -> PipelineConfig:
-        config = super().configure_pipeline(model_config, resume_from)
+    def configure_pipeline(self, model_config: ModelConfig, resume_from: str = None, **kwargs: dict[str, Any]) -> PipelineConfig:
+        config = super().configure_pipeline(model_config, resume_from, **kwargs)
         self.data_processing_comp.data_reader_config.init_args["tasks"] = (
             "spatial_map_text_only"
         )
@@ -476,8 +481,8 @@ class SPATIAL_MAP_REPORTING_PIPELINE(SPATIAL_MAP_PIPELINE):
     """This method is used to define an eval pipeline with only a metric report component,
     on the spatial map dataset."""
 
-    def configure_pipeline(self, model_config: ModelConfig, resume_from: str = None) -> PipelineConfig:
-        super().configure_pipeline(model_config, resume_from)
+    def configure_pipeline(self, model_config: ModelConfig, resume_from: str = None, **kwargs: dict[str, Any]) -> PipelineConfig:
+        super().configure_pipeline(model_config, resume_from, **kwargs)
         self.preeval_data_post_processing_comp.data_reader_config.init_args["path"] = resume_from
         # Configure the pipeline
         return PipelineConfig(

diff --git a/setup.py b/setup.py
@@ -22,10 +22,10 @@
         'datasets>=3.2.0',
         'fuzzywuzzy>=0.18.0',
         'jsonlines>=2.0.0',
-        'pandas>=2.2.1',
         'pillow>=10.0.1',
         'torch>=2.6.0',
-        'numpy==1.26.4',
+        'numpy>=2.2',
+        'pandas>=2.2.1',
         'tqdm>=4.65.0',
         'jinja2>=3.1.3',
         'transformers>=4.51.3',
@@ -38,10 +38,10 @@
         'google-generativeai>=0.7.0',
         'openai>=1.35.5',
         'bitsandbytes>=0.42.0',
+        'pycocotools>=2.0.10',
+        'vllm==0.8.5',
         'accelerate>=0.21.0',
-        'pycocotools>=2.0.8',
-        'vllm>=0.8.5',
-        'latex2sympy2>=1.9.1',
+        # 'latex2sympy2_extended[antlr4_13_2]',  # optional dependency: install only if running the mathvision evaluation (imported lazily in mathvision_utils)
     ],
     extras_require={
         'llamacpp': [