
Commit aad3062

support reasoning_effort parameter for reasoning models
1 parent 4b85f01 commit aad3062

File tree

3 files changed: +167 -49 lines changed


optillm/inference.py

Lines changed: 80 additions & 12 deletions
@@ -1321,7 +1321,6 @@ def create(
         presence_penalty: float = 0,
         frequency_penalty: float = 0,
         logit_bias: Optional[Dict[str, float]] = None,
-        user: Optional[str] = None,
         seed: Optional[int] = None,
         logprobs: Optional[bool] = None,
         top_logprobs: Optional[int] = None,
@@ -1337,11 +1336,12 @@ def create(
         # Entropy specific params
         top_k: int = 27,
         min_p: float = 0.03,
-        thought_switch_tokens: List[str] = ["Wait,", "Alternatively,"],
-        min_thinking_tokens: int = 512,
-        max_thinking_tokens: int = 2048,
-        max_thoughts: int = 4,
-        num_traces: int = 1,
+        # Thinking specific params
+        reasoning_effort: str = "low",
+        thought_switch_tokens: List[str] = [],
+        min_thinking_tokens: Optional[int] = None,
+        max_thinking_tokens: Optional[int] = None,
+        max_thoughts: Optional[int] = None,
         prefill: str = "",
         start_think_token: str ="<think>",
         end_think_token: str = "</think>",
@@ -1443,15 +1443,21 @@ def create(
                 pipeline.current_model = pipeline.current_model.to(original_dtype)
 
             elif decoding == "thinkdeeper":
-                thinkdeeper_config = {
-                    "thought_switch_tokens": thought_switch_tokens,
-                    "min_thinking_tokens": min_thinking_tokens,
-                    "max_thinking_tokens": max_thinking_tokens,
-                    "max_thoughts": max_thoughts,
-                    "prefill": prefill,
+                # Get base config for reasoning effort
+                thinkdeeper_config = get_effort_profile(reasoning_effort)
+
+                # Override with any custom parameters
+                custom_config = {
+                    "min_thinking_tokens": min_thinking_tokens if min_thinking_tokens is not None else thinkdeeper_config["min_thinking_tokens"],
+                    "max_thinking_tokens": max_thinking_tokens if max_thinking_tokens is not None else thinkdeeper_config["max_thinking_tokens"],
+                    "max_thoughts": max_thoughts if max_thoughts is not None else thinkdeeper_config["max_thoughts"],
+                    "thought_switch_tokens": thought_switch_tokens if thought_switch_tokens else thinkdeeper_config["thought_switch_tokens"],
+                    "prefill": prefill if prefill else thinkdeeper_config["prefill"],
                     "start_think_token": start_think_token,
                     "end_think_token": end_think_token,
                 }
+                thinkdeeper_config.update(custom_config)
+
                 result = thinkdeeper_decode(
                     pipeline.current_model,
                     pipeline.tokenizer,
@@ -1584,3 +1590,65 @@ def parse_model_string(model: str) -> ModelConfig:
         enable_prompt_caching=False,
         dynamic_temperature=False,
     )
+
+# Low Reasoning Effort
+# Suitable for:
+# - Simple, straightforward questions
+# - Quick clarifications
+# - Well-defined tasks with clear steps
+LOW_EFFORT = {
+    "min_thinking_tokens": 256,    # ~100-200 words minimum
+    "max_thinking_tokens": 512,    # ~200-400 words maximum
+    "max_thoughts": 2,             # Allow only one alternative perspective
+    "thought_switch_tokens": [
+        "However,",                # Single alternative consideration
+        "Wait,",
+        "Alternatively,",
+    ],
+    "prefill": "Let me think about this briefly..."
+}
+
+# Medium Reasoning Effort
+# Suitable for:
+# - Moderate complexity problems
+# - Analysis requiring multiple perspectives
+# - Tasks needing detailed explanation
+MEDIUM_EFFORT = {
+    "min_thinking_tokens": 512,    # ~200-400 words minimum
+    "max_thinking_tokens": 1024,   # ~400-800 words maximum
+    "max_thoughts": 4,             # Allow multiple perspective shifts
+    "thought_switch_tokens": [
+        "Additionally,",
+        "Alternatively,",
+        "However,",
+        "Wait,",
+    ],
+    "prefill": "Let me analyze this from multiple angles..."
+}
+
+# High Reasoning Effort
+# Suitable for:
+# - Complex problem solving
+# - Deep analysis tasks
+# - Multi-step reasoning chains
+HIGH_EFFORT = {
+    "min_thinking_tokens": 1024,   # ~400-800 words minimum
+    "max_thinking_tokens": 2048,   # ~800-1600 words maximum
+    "max_thoughts": 6,             # Allow extensive exploration
+    "thought_switch_tokens": [
+        "Additionally,",
+        "Alternatively,",
+        "However,",
+        "Wait,",
+    ],
+    "prefill": "This requires careful analysis. Let me think through it systematically..."
+}
+
+def get_effort_profile(effort_level: str) -> dict:
+    """Get reasoning effort profile based on specified level."""
+    profiles = {
+        "low": LOW_EFFORT,
+        "medium": MEDIUM_EFFORT,
+        "high": HIGH_EFFORT
+    }
+    return profiles.get(effort_level, LOW_EFFORT)
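
Note: a minimal sketch of how a base profile combines with caller overrides, mirroring the merge in the thinkdeeper branch of create() above. The 1536 override is an illustrative value, not part of this commit, and the snippet assumes the definitions above are in scope:

    base = dict(get_effort_profile("medium"))      # unknown levels fall back to LOW_EFFORT
    overrides = {"max_thinking_tokens": 1536}      # illustrative caller-supplied override
    config = {**base, **overrides}                 # explicit parameters take precedence over the profile
    assert config["min_thinking_tokens"] == 512    # inherited from MEDIUM_EFFORT
    assert config["max_thinking_tokens"] == 1536   # overridden by the caller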

optillm/thinkdeeper.py

Lines changed: 82 additions & 36 deletions
@@ -5,6 +5,7 @@
 import logging
 
 logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
 
 DEFAULT_CONFIG = {
     "min_thinking_tokens": 512,
@@ -31,14 +32,40 @@ def __init__(self, config: Dict[str, Any], tokenizer, model):
         self._start_think_token = start_tokens[0] if len(start_tokens) == 1 else start_tokens[1]
         self.end_think_token = end_tokens[0] if len(end_tokens) == 1 else end_tokens[1]
 
-        # Get token IDs for thought switching indicators
-        self.thought_switch_tokens = set()
+        # Store thought switch markers as token sequences
+        self.thought_switch_sequences = []
         for phrase in self.config["thought_switch_tokens"]:
+            # Encode without adding special tokens to get exact sequence
             token_ids = self.tokenizer.encode(phrase, add_special_tokens=False)
-            self.thought_switch_tokens.update(token_ids)
+            self.thought_switch_sequences.append(token_ids)
+            logger.debug(f"Encoded '{phrase}' to token sequence: {token_ids}")
+            logger.debug(f"Decoded back: {self.tokenizer.decode(token_ids)}")
 
         # Track thought switches
         self.thought_count = 0
+        self.current_sequence = []  # Track recent tokens for sequence matching
+        self.max_sequence_length = max(len(seq) for seq in self.thought_switch_sequences)
+
+        for phrase, sequence in zip(self.config["thought_switch_tokens"], self.thought_switch_sequences):
+            logger.debug(f"Thought switch marker '{phrase}' encoded as: {sequence}")
+            logger.debug(f"Decoded back as: {self.tokenizer.decode(sequence)}")
+
+    def is_thought_switch(self, token: int) -> bool:
+        """Check if adding this token creates a thought switch sequence."""
+        # Add new token to current sequence
+        self.current_sequence.append(token)
+
+        # Keep only the most recent tokens that could match our sequences
+        if len(self.current_sequence) > self.max_sequence_length:
+            self.current_sequence = self.current_sequence[-self.max_sequence_length:]
+
+        # Check if current sequence ends with any thought switch sequence
+        for sequence in self.thought_switch_sequences:
+            if len(sequence) <= len(self.current_sequence) and \
+               self.current_sequence[-len(sequence):] == sequence:
+                return True
+
+        return False
 
     @torch.inference_mode()
     def reasoning_effort(self, messages) -> str:
@@ -62,11 +89,16 @@ def reasoning_effort(self, messages) -> str:
             out = self.model(input_ids=tokens, past_key_values=kv, use_cache=True)
             logits = out.logits[0, -1, :]
 
-            # Force end think token if we exceed limits
-            if (n_thinking_tokens >= self.config["max_thinking_tokens"] or
-                self.thought_count >= self.config["max_thoughts"]):
-                next_token = self.end_think_token
+            # Check if we need to force end token
+            force_end = (n_thinking_tokens >= self.config["max_thinking_tokens"] or
+                         self.thought_count >= self.config["max_thoughts"])
+
+            if force_end:
                 logger.debug(f"Forcing end think token. Tokens: {n_thinking_tokens}, Thoughts: {self.thought_count}")
+                next_token = self.end_think_token
+                response_chunks.append(self.tokenizer.decode([next_token]))
+                # Break immediately when forcing end token
+                break
             else:
                 next_token = torch.multinomial(
                     torch.softmax(logits, dim=-1), 1
@@ -76,42 +108,56 @@ def reasoning_effort(self, messages) -> str:
             next_str = self.tokenizer.decode([next_token])
 
             # Check if this is a thought-switching token
-            if next_token in self.thought_switch_tokens:
+            if self.is_thought_switch(next_token):
                 self.thought_count += 1
-                logger.debug(f"Detected thought switch. Total thoughts: {self.thought_count}")
+                logger.debug(f"Detected thought switch marker. Total thoughts: {self.thought_count}")
+                # Clear the sequence after detecting a switch
+                self.current_sequence = []
 
-            # Track if we've seen the end think token
+            # Handle natural end think token
             if next_token == self.end_think_token:
                 seen_end_think = True
                 logger.debug("Found end think token")
-
-            # Need to continue generating if:
-            # 1. We hit end think/eos before min tokens OR
-            # 2. We hit eos without seeing end think token
-            if ((next_token in (self.end_think_token, self.model.config.eos_token_id)
-                and n_thinking_tokens < self.config["min_thinking_tokens"])
-                or (next_token == self.model.config.eos_token_id and not seen_end_think)):
-
-                # Insert thought transition
-                replacement = random.choice(self.config["thought_switch_tokens"])
-                logger.debug(f"Inserting thought transition: '{replacement}' (tokens: {n_thinking_tokens})")
-                response_chunks.append(replacement)
-                replacement_tokens = self.tokenizer.encode(replacement)
-                n_thinking_tokens += len(replacement_tokens)
-                tokens = torch.tensor([replacement_tokens]).to(tokens.device)
-                self.thought_count += 1
-                seen_end_think = False
-
-            elif next_token == self.model.config.eos_token_id and seen_end_think:
-                logger.debug("Reached EOS after end think token - stopping generation")
-                break
 
-            else:
-                response_chunks.append(next_str)
-                n_thinking_tokens += 1
-                tokens = torch.tensor([[next_token]]).to(tokens.device)
+                # If we haven't reached minimum tokens, continue with thought transition
+                if n_thinking_tokens < self.config["min_thinking_tokens"]:
+                    replacement = random.choice(self.config["thought_switch_tokens"])
+                    logger.debug(f"Inserting thought transition: '{replacement}' (tokens: {n_thinking_tokens})")
+                    response_chunks.append(replacement)
+                    replacement_tokens = self.tokenizer.encode(replacement)
+                    n_thinking_tokens += len(replacement_tokens)
+                    tokens = torch.tensor([replacement_tokens]).to(tokens.device)
+                    self.thought_count += 1
+                    seen_end_think = False
+                    continue
+
+            # Handle EOS token
+            if next_token == self.model.config.eos_token_id:
+                if seen_end_think:
+                    logger.debug("Reached EOS after end think token - stopping generation")
+                    break
+                elif n_thinking_tokens < self.config["min_thinking_tokens"]:
+                    # Continue with thought transition if under minimum tokens
+                    replacement = random.choice(self.config["thought_switch_tokens"])
+                    logger.debug(f"Inserting thought transition: '{replacement}' (tokens: {n_thinking_tokens})")
+                    response_chunks.append(replacement)
+                    replacement_tokens = self.tokenizer.encode(replacement)
+                    n_thinking_tokens += len(replacement_tokens)
+                    tokens = torch.tensor([replacement_tokens]).to(tokens.device)
+                    self.thought_count += 1
+                    continue
+                else:
+                    # Force end think token if we haven't seen it
+                    logger.debug("Reached EOS without end think token - adding end token")
+                    response_chunks.append(self.tokenizer.decode([self.end_think_token]))
+                    break
+
+            # Normal token processing
+            response_chunks.append(next_str)
+            n_thinking_tokens += 1
+            tokens = torch.tensor([[next_token]]).to(tokens.device)
 
-        # Join all chunks and trim off the initial prompt
+        # Join all chunks and add framing tokens
        response = "".join(response_chunks)
        full_response = f"{self.config['start_think_token']}\n{self.config['prefill']}{response}"
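
Note: a small standalone sketch of the suffix-matching idea behind is_thought_switch above, using made-up token IDs in place of a real tokenizer's output:

    # Hypothetical token-ID sequences standing in for tokenized markers such as "Wait," or "However,".
    thought_switch_sequences = [[511, 72], [893]]
    max_len = max(len(seq) for seq in thought_switch_sequences)

    def is_thought_switch(current_sequence, token):
        """Return True when appending `token` makes the recent tokens end with a marker sequence."""
        current_sequence.append(token)
        del current_sequence[:-max_len]  # keep only the most recent tokens that could still match
        return any(current_sequence[-len(seq):] == seq for seq in thought_switch_sequences)

    recent = []
    assert not is_thought_switch(recent, 511)  # a prefix of a marker is not yet a switch
    assert is_thought_switch(recent, 72)       # completing [511, 72] counts as a switch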

scripts/eval_optillmbench.py

Lines changed: 5 additions & 1 deletion
@@ -160,7 +160,11 @@ def evaluate_model(
                 {"role": "user", "content": prompt}
             ],
             temperature=0.2,
-            max_tokens=4096
+            max_tokens=4096,
+            reasoning_effort="low",
+            extra_body = {
+                "decoding" : "thinkdeeper",
+            }
         )
 
         # Calculate time taken
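
Note: for reference, a minimal client-side call exercising the new parameter through an OpenAI-compatible optillm endpoint; the base_url, api_key, and model name below are placeholders, not part of this commit:

    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:8000/v1", api_key="optillm")  # placeholder proxy endpoint
    response = client.chat.completions.create(
        model="meta-llama/Llama-3.2-1B-Instruct",  # placeholder model
        messages=[{"role": "user", "content": "What is 17 * 24?"}],
        temperature=0.2,
        max_tokens=4096,
        reasoning_effort="low",                    # new parameter added in this commit
        extra_body={"decoding": "thinkdeeper"},    # route decoding through thinkdeeper
    )
    print(response.choices[0].message.content)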
