Commit f35d534

Integrate Qwen CePO flow
1 parent 79604cd commit f35d534

File tree: 4 files changed (+104, -37 lines)

optillm/cepo/cepo.py

Lines changed: 58 additions & 36 deletions
@@ -20,18 +20,22 @@ class CepoConfig:
     bestofn_n: int  # number of responses to be generated in best of n stage
     bestofn_temperature: float  # temperature for verifier in best of n stage
     bestofn_max_tokens: int  # maximum number of tokens for verifier in best of n stage
-    bestofn_rating_type: Literal["absolute", "pairwise"]  # type of rating in best of n stage
+    bestofn_rating_type: Literal["absolute", "pairwise", "majority"]  # type of rating in best of n stage
     planning_n: int  # number of plans generated in planning stage
     planning_m: int  # number of attempts to generate n plans in planning stage
     planning_temperature_step1: float  # temperature for generator in step 1 of planning stage
     planning_temperature_step2: float  # temperature for generator in step 2 of planning stage
+    planning_temperature_direct_resp: float  # temperature for the direct answer generated when planning fails after step 2
     planning_temperature_step3: float  # temperature for generator in step 3 of planning stage
     planning_temperature_step4: float  # temperature for generator in step 4 of planning stage
     planning_max_tokens_step1: int  # maximum number of tokens in step 1 of planning stage
     planning_max_tokens_step2: int  # maximum number of tokens in step 2 of planning stage
+    planning_max_tokens_direct_resp: int  # maximum number of tokens for the direct answer generated when planning fails after step 2
     planning_max_tokens_step3: int  # maximum number of tokens in step 3 of planning stage
     planning_max_tokens_step4: int  # maximum number of tokens in step 4 of planning stage
     use_plan_diversity: bool  # whether to use plan diversity
+    use_reasoning_fallback: bool  # whether to fall back to lower reasoning effort levels when a higher level fails
+    num_of_retries: int  # number of retries if the llm call fails; 0 for no retries
     rating_model: Optional[str] = None  # model to be used for rating
     print_output: bool = False  # whether to print the output of each stage

@@ -203,6 +207,7 @@ def extract_llm_response(response):
 def llm_call(
     client: Any,
     provider_request: dict,
+    cepo_config: CepoConfig,
 ) -> tuple[str, str, int]:
     """
     Call the LLM with retries on transient errors.
@@ -220,7 +225,7 @@ def llm_call(
         - finish_reason: Why generation stopped.
         - completion_tokens: Number of tokens generated.
     """
-    retries = 2  # total attempts = retries + 1 initial call
+    retries = cepo_config.num_of_retries  # total attempts = retries + 1 initial call
     for attempt in range(retries):
         try:
             response_object = client.chat.completions.create(
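
The retry count now comes from cepo_config.num_of_retries instead of a hard-coded 2. A self-contained sketch of the documented semantics (the comment promises num_of_retries + 1 total attempts); call_once and TransientError are illustrative stand-ins, not names from this repo:

import time

class TransientError(Exception):
    """Stand-in for the transient API errors llm_call retries on."""

def call_with_retries(call_once, num_of_retries: int):
    # Total attempts = num_of_retries + 1 initial call, matching the comment
    # in llm_call; num_of_retries=0 means a single attempt and no retry.
    for attempt in range(num_of_retries + 1):
        try:
            return call_once()
        except TransientError:
            if attempt == num_of_retries:
                raise  # retries exhausted, surface the error
            time.sleep(2 ** attempt)  # simple exponential backoff
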
@@ -247,7 +252,8 @@
 def llm_call_reason_effort_fallback(
     client: Any,
     provider_request: dict,
-    reasoning_effort_levels: list
+    reasoning_effort_levels: list,
+    cepo_config: CepoConfig
 ) -> tuple[Optional[Any], str, int]:
     """
     Call LLM with fallback on reasoning effort levels.
@@ -291,13 +297,16 @@ def llm_call_reason_effort_fallback(
     automatically, but a permanent fix may require upstream changes
     (see https://github.com/pydantic/pydantic-ai/issues/2449).
     """
+    if not cepo_config.use_reasoning_fallback:
+        reasoning_effort_levels = ["high"]
     for effort in reasoning_effort_levels:
         try:
             # Try with the current reasoning effort level
             provider_request["reasoning_effort"] = effort
             response, finish_reason, completion_tokens = llm_call(
                 client=client,
                 provider_request=provider_request,
+                cepo_config=cepo_config
             )
             if response is not None and finish_reason != "length":
                 return response, finish_reason, completion_tokens
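
With use_reasoning_fallback disabled, the gating above collapses the ladder to a single "high" attempt, so a failure at high effort is terminal instead of being retried at "medium" or "low". A minimal sketch of that control flow, with call_at_effort as a hypothetical stand-in for the wrapped llm_call:

from typing import Callable, Optional, Tuple

def try_effort_ladder(
    call_at_effort: Callable[[str], Tuple[Optional[str], str]],
    use_reasoning_fallback: bool,
    levels: tuple = ("high", "medium", "low"),
) -> Optional[str]:
    if not use_reasoning_fallback:
        levels = ("high",)  # single attempt, no stepping down
    for effort in levels:
        response, finish_reason = call_at_effort(effort)
        if response is not None and finish_reason != "length":
            return response  # accept the first complete response
    return None  # every effort level failed or was truncated
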
@@ -310,27 +319,6 @@
     return None, "error", 0
 
 
-def fallback_direct_answer(client, model, question, max_tokens=None, temperature=1.0, top_p=1.0):  # TODO clean-up
-    messages = [
-        {"role": "user", "content": question},
-    ]
-
-    response, finish_reason, completion_tokens = llm_call_reason_effort_fallback(
-        messages=messages,
-        client=client,
-        model=model,
-        max_tokens=max_tokens,
-        temperature=temperature,
-        top_p=top_p,
-        reasoning_effort_levels=["high", "medium", "low"]
-    )
-    if response is None or finish_reason == "length":
-        print("Direct answer failed, empty response or length")
-        response = ""
-    messages.append({"role": "assistant", "content": response})
-    return response, messages
-
-
 def generate_completion(system_prompt: str, task: str, client: Any, model: str, cepo_config: CepoConfig, approach: Optional[str] = None, request_id: str = None) -> str:
     """
     Generates a completion based on the provided system prompt and task.
@@ -385,7 +373,8 @@ def generate_single_plan(i):
     response, finish_reason, completion_tokens = llm_call_reason_effort_fallback(
         client=client,
         provider_request=provider_request,
-        reasoning_effort_levels=["high", "medium"]
+        reasoning_effort_levels=["high", "medium"],
+        cepo_config=cepo_config
     )
     local_completion_tokens += completion_tokens
     # Log provider call if conversation logging is enabled
@@ -418,7 +407,8 @@ def generate_single_plan(i):
     response, finish_reason, completion_tokens = llm_call_reason_effort_fallback(
         client=client,
         provider_request=provider_request,
-        reasoning_effort_levels=["high", "medium"]
+        reasoning_effort_levels=["high", "medium"],
+        cepo_config=cepo_config
     )
     local_completion_tokens += completion_tokens

@@ -453,10 +443,39 @@ def generate_single_plan(i):
     plans = [plan for _, plan in sorted(plans)]  # keep original order
 
     if not plans:
-        # Fallback plan
-        fallback_generation, fallback_messages = fallback_direct_answer(client, model, question_only)
-        plans.append(fallback_generation)
-        cb_log[f"messages_planning_fallback_used"] = fallback_messages
+        # If no plans were generated, attempt to answer directly
+        messages = [
+            {"role": "user", "content": question_only},
+        ]
+
+        provider_request = {
+            "model": model,
+            "messages": messages,
+            "max_tokens": cepo_config.planning_max_tokens_direct_resp,
+            "temperature": cepo_config.planning_temperature_direct_resp,
+            "top_p": 0.95,
+        }
+
+        response, finish_reason, completion_tokens = llm_call_reason_effort_fallback(
+            client=client,
+            provider_request=provider_request,
+            reasoning_effort_levels=["high", "medium", "low"],
+            cepo_config=cepo_config
+        )
+        local_completion_tokens += completion_tokens
+
+        # Log provider call if conversation logging is enabled
+        if hasattr(optillm, 'conversation_logger') and optillm.conversation_logger and request_id:
+            response_dict = response.model_dump() if hasattr(response, 'model_dump') else response
+            optillm.conversation_logger.log_provider_call(request_id, provider_request, response_dict)
+
+        if response is None or finish_reason == "length":
+            print("Direct answer failed, empty response or length")
+            response = ""
+        messages.append({"role": "assistant", "content": response})
+
+        plans.append(response)
+        cb_log["messages_planning_fallback_used"] = messages
     if cepo_config.print_output:
         print(f"\nCePO: No plans generated successfully. Taking the fallback.\n")
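
A small detail in the inlined fallback above: the response is duck-typed before logging, since the call may hand back a pydantic-style object or a plain dict. The guard in isolation — a sketch, with to_loggable as a hypothetical name:

def to_loggable(response):
    # pydantic-style response objects expose model_dump(); anything else
    # (plain dicts, None) passes through unchanged.
    return response.model_dump() if hasattr(response, "model_dump") else response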

@@ -483,7 +502,8 @@ def generate_single_plan(i):
     response, finish_reason, completion_tokens_ = llm_call_reason_effort_fallback(
         client=client,
         provider_request=provider_request,
-        reasoning_effort_levels=["high", "medium"]
+        reasoning_effort_levels=["high", "medium"],
+        cepo_config=cepo_config
     )
     completion_tokens += completion_tokens_

@@ -519,7 +539,8 @@ def generate_single_plan(i):
     response, finish_reason, completion_tokens_ = llm_call_reason_effort_fallback(
         client=client,
         provider_request=provider_request,
-        reasoning_effort_levels=["high", "medium"]
+        reasoning_effort_levels=["high", "medium"],
+        cepo_config=cepo_config
    )
     completion_tokens += completion_tokens_

@@ -718,7 +739,8 @@ def rate_completions_absolute(system_prompt: str, initial_query: str, client: An
     rating_response, _, completion_tokens = llm_call_reason_effort_fallback(
         client=client,
         provider_request=provider_request,
-        reasoning_effort_levels=["high", "medium"]
+        reasoning_effort_levels=["high", "medium"],
+        cepo_config=cepo_config
     )
 
     # Log provider call if conversation logging is enabled
@@ -906,7 +928,7 @@ def majority_vote_mcq(completions, last_n_chars=100):
     return response, count
 
 
-def rate_completions_majority_vote(completions: list[str], last_n_chars: int = 150) -> tuple[str, int, dict]:
+def rate_completions_majority(completions: list[str], last_n_chars: int = 150) -> tuple[str, int, dict]:
     mcq_majority, count = majority_vote_mcq(completions, last_n_chars)
     if mcq_majority is None:
         return majority_vote_math(completions, last_n_chars)
@@ -948,8 +970,8 @@ def cepo(system_prompt: str, initial_query: str, client: Any, model: str, cepo_c
         best_completion, completion_tokens_rating, cb_log = rate_completions_absolute(system_prompt, initial_query, client, rating_model, completions, cepo_config, cb_log, request_id)
     elif cepo_config.bestofn_rating_type == "pairwise":
         best_completion, completion_tokens_rating, cb_log = rate_completions_pairwise(system_prompt, initial_query, client, rating_model, completions, cepo_config, cb_log, request_id)
-    elif cepo_config.bestofn_rating_type == "majority_with_code_exec":
-        best_completion, _ = rate_completions_majority_vote(completions)
+    elif cepo_config.bestofn_rating_type == "majority":
+        best_completion, _ = rate_completions_majority(completions)
         completion_tokens_rating = 0
     else:
         raise ValueError("Invalid rating type in cepo_config")
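
The renamed "majority" rating path votes over the tails of the candidate completions: MCQ-style first, then a math-style vote as fallback. A compact sketch of the underlying idea; the repo's majority_vote_mcq/majority_vote_math are more careful than this simple Counter over raw tails:

from collections import Counter

def majority_over_tails(completions: list[str], last_n_chars: int = 150) -> tuple[str, int]:
    # Final answers tend to sit at the end of a completion, so vote on tails.
    tails = [c[-last_n_chars:].strip() for c in completions]
    winner_tail, count = Counter(tails).most_common(1)[0]
    # Return the first full completion whose tail matches the winning vote.
    winner = next(c for c in completions if c[-last_n_chars:].strip() == winner_tail)
    return winner, count
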
Lines changed: 4 additions & 1 deletion
@@ -1,17 +1,20 @@
 bestofn_n: 3
 bestofn_temperature: 0.1
 bestofn_max_tokens: 4096
-bestofn_rating_type: "absolute" # or "pairwise"
+bestofn_rating_type: "absolute" # or "pairwise", "majority"
 planning_n: 3
 planning_m: 6
 planning_temperature_step1: 0.55
 planning_temperature_step2: 0.25
+planning_temperature_direct_resp: 0.1
 planning_temperature_step3: 0.1
 planning_temperature_step4: 0
 planning_max_tokens_step1: 4096
 planning_max_tokens_step2: 4096
+planning_max_tokens_direct_resp: 4096
 planning_max_tokens_step3: 4096
 planning_max_tokens_step4: 4096
 use_plan_diversity: False
 rating_model: null
+use_reasoning_fallback: False
 print_output: False
Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
+bestofn_n: 1
+bestofn_temperature: 0.6
+bestofn_max_tokens: 40960
+bestofn_rating_type: "absolute"
+planning_n: 2
+planning_m: 4
+planning_temperature_step1: 1.0
+planning_temperature_step2: 1.0
+planning_temperature_direct_resp: 0.6
+planning_temperature_step3: 1.0
+planning_temperature_step4: 0.5
+planning_max_tokens_step1: 40960
+planning_max_tokens_step2: 40960
+planning_max_tokens_direct_resp: 32768
+planning_max_tokens_step3: 40960
+planning_max_tokens_step4: 40960
+use_plan_diversity: False
+rating_model: null
+use_reasoning_fallback: True
+num_of_retries: 2
+print_output: true
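
This preset supplies every field the updated CepoConfig requires, so it can be loaded directly. A sketch assuming a plain yaml.safe_load into the dataclass; the loader optillm actually uses may differ:

import yaml  # PyYAML, assumed available
from optillm.cepo.cepo import CepoConfig  # import path taken from this diff

def load_cepo_config(path: str) -> CepoConfig:
    with open(path) as f:
        raw = yaml.safe_load(f)
    return CepoConfig(**raw)  # every YAML key must match a dataclass field
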
Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
+bestofn_n: 3
+bestofn_temperature: 0.6
+bestofn_max_tokens: 20480
+bestofn_rating_type: "majority"
+planning_n: 2
+planning_m: 4
+planning_temperature_step1: 0.8
+planning_temperature_step2: 0.8
+planning_temperature_direct_resp: 0.6
+planning_temperature_step3: 0.8
+planning_temperature_step4: 0.8
+planning_max_tokens_step1: 28672
+planning_max_tokens_step2: 24576
+planning_max_tokens_direct_resp: 32768
+planning_max_tokens_step3: 20481
+planning_max_tokens_step4: 20482
+use_plan_diversity: False
+rating_model: null
+use_reasoning_fallback: False
+num_of_retries: 0
+print_output: False
