3 files changed: +39 -10 lines

@@ -15,15 +15,20 @@ launcher:
 backend:
   device: cuda
   device_ids: 0
-  no_weights: true
+  no_weights: False
   task: text-generation
-  model: facebook/opt-125m
+  model: openai/gpt-oss-20b
+  torch_dtype: auto
+  device_map: auto

 scenario:
   dataset_name: EnergyStarAI/text_generation
   text_column_name: text
   num_samples: 1000
   truncation: True
+  reasoning: True
+  reasoning_params:
+    reasoning_effort: high

   input_shapes:
     batch_size: 1
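Not part of the diff: a minimal sketch of how the new `reasoning_params` entry is expected to reach the tokenizer. It assumes a `transformers` version whose `openai/gpt-oss-20b` chat template accepts a `reasoning_effort` keyword; the preprocessing change below simply forwards whatever the YAML provides via `**scenario_config.reasoning_params`.

```python
# Sketch only: load the tokenizer named in the config and pass the
# reasoning_params entry as an extra chat-template keyword argument.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("openai/gpt-oss-20b")
encoded = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Summarize the dataset sample here."}],
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    reasoning_effort="high",  # mirrors scenario.reasoning_params in the YAML above
)
print(len(encoded["input_ids"]))  # number of prompt tokens
```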
@@ -292,13 +292,34 @@ def tokenize_function(examples):
             padding=padding,
         )
 
-    dataset = dataset.map(
-        function=tokenize_function,
-        desc="Running tokenizer on dataset",
-        remove_columns=dataset.features,
-        writer_batch_size=50,
-        batched=True,
-    ).with_format("torch")
+    def reasoning_tokenize_function(examples):
+        return pretrained_processor.apply_chat_template(
+            [{"role": "user", "content": examples[scenario_config.text_column_name]}],
+            truncation=scenario_config.truncation,
+            max_length=min(max_length, 2048) - new_tokens,
+            padding=padding,
+            add_generation_prompt=True,
+            enable_thinking=True,
+            tokenize=True,
+            return_dict=True,
+            **scenario_config.reasoning_params,
+        )
+
+    if scenario_config.reasoning:
+        dataset = dataset.map(
+            function=reasoning_tokenize_function,
+            desc="Running reasoning tokenizer on dataset",
+            remove_columns=dataset.features,
+        ).with_format("torch")
+
+    else:
+        dataset = dataset.map(
+            function=tokenize_function,
+            desc="Running tokenizer on dataset",
+            remove_columns=dataset.features,
+            writer_batch_size=50,
+            batched=True,
+        ).with_format("torch")
 
     return dataset
 
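Also not part of the diff: a self-contained sketch of the new reasoning branch on a toy two-row dataset. The names differ from the file above (`tokenizer` stands in for `pretrained_processor`, the toy dataset and literal `reasoning_effort="high"` are hypothetical); the real benchmark maps over the EnergyStarAI dataset using the scenario config.

```python
# Sketch of the scenario_config.reasoning branch: map a per-example chat-template
# tokenizer over a toy dataset (non-batched, matching the new map() call).
from datasets import Dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("openai/gpt-oss-20b")
toy = Dataset.from_dict({"text": ["What is 2 + 2?", "Name three prime numbers."]})

def reasoning_tokenize_function(example):
    # one example per call, since this branch maps without batched=True
    return tokenizer.apply_chat_template(
        [{"role": "user", "content": example["text"]}],
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        reasoning_effort="high",
    )

encoded = toy.map(
    reasoning_tokenize_function,
    remove_columns=toy.features,
    desc="Running reasoning tokenizer on dataset",
).with_format("torch")
print(encoded[0]["input_ids"].shape)  # 1-D tensor of prompt token ids
```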
@@ -34,7 +34,10 @@ class EnergyStarConfig(ScenarioConfig):
     dataset_prefix1: str = field(default="", metadata={"help": "Prefix to add to text2textgeneration input."})
     dataset_prefix2: str = field(default="", metadata={"help": "Prefix to add to text2textgeneration input."})
     t5_task: str = field(default="", metadata={"help": "Task for categorizing text2textgeneration tasks."})
-
+    reasoning: Union[bool, str] = field(default=False, metadata={"help": "To activate reasoning mode."})
+    reasoning_params: Dict[str, Any] = field(
+        default_factory=dict, metadata={"help": "Additional parameters for reasoning model."}
+    )
     # image dataset options
     image_column_name: str = field(default="image", metadata={"help": "Name of the column with the image input."})
     resize: Union[bool, str] = field(default=False, metadata={"help": "To resize the input images."})
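A small, self-contained illustration of the config pattern added here (the dataclass below is a hypothetical stand-in, not the real `EnergyStarConfig`): because `reasoning_params` defaults to an empty dict via `default_factory`, unpacking it with `**` is a no-op unless the YAML supplies keys such as `reasoning_effort`.

```python
from dataclasses import dataclass, field
from typing import Any, Dict, Union

@dataclass
class ReasoningOptions:  # hypothetical stand-in for the new EnergyStarConfig fields
    reasoning: Union[bool, str] = False
    reasoning_params: Dict[str, Any] = field(default_factory=dict)

def render_template(**template_kwargs):
    # stands in for apply_chat_template(..., **scenario_config.reasoning_params)
    return template_kwargs

print(render_template(**ReasoningOptions().reasoning_params))
# {}  -> no extra kwargs when reasoning_params is left at its default
custom = ReasoningOptions(reasoning=True, reasoning_params={"reasoning_effort": "high"})
print(render_template(**custom.reasoning_params))
# {'reasoning_effort': 'high'}
```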