
Commit bd503cf

Author: Yousef El-Kurdi (committed)
Initial commit - think budget-forcing - tests run - WIP
1 parent d92a44f · commit bd503cf

File tree

8 files changed: +344 −0 lines changed

mellea/backends/openai.py

Lines changed: 149 additions & 0 deletions

@@ -467,6 +467,155 @@ def _generate_from_chat_context_standard(

        return parsed_result

    def generate_with_budget_forcing(
        self,
        action: CBlock,
        *,
        think_max_tokens: int = 3072,
        answer_max_tokens: int | None = None,
        start_think_token: str | None = "<think>",
        end_think_token: str | None = "</think>",
        begin_response_token: str | None = None,
        end_response_token: str | None = None,
        think_wait_suffix: str | None = None,
        answer_suffix: str | None = "The final answer is:",
        answer_token: str | None = "boxed",
        model_options: dict | None = None,
    ) -> tuple[str, int]:
        """Generate with budget forcing using the completions API.

        This relies on raw autocompletion and assumes the model's output is structured in the
        following form: '<think> ... </think> summary answer'.
        The budget-forcing method is proposed in the paper: https://arxiv.org/abs/2501.19393
        This implementation tries to follow the key outlines of the paper while ensuring stable
        and fail-safe operation. It is performed via multi-step generation: the model is called
        multiple times until the requirements are met, i.e. the response is assembled conditionally.

        Args:
            think_max_tokens: Budget in number of tokens allocated for the think block.
            answer_max_tokens: Budget in number of tokens allocated for the summary and answer
                block; None indicates generating until EoS.
            start_think_token: String indicating the start of the think block, default "<think>".
            end_think_token: String indicating the end of the think block, default "</think>".
            begin_response_token: Used by certain models, string indicating the start of the
                response block, e.g. "<response>", default None.
            end_response_token: Used by certain models, string indicating the end of the
                response block, e.g. "</response>", default None.
            think_wait_suffix: String appended to force continued thinking, e.g. "\nWait".
                If set to None, additional thinking is not forced (upper-bound budget case).
            answer_suffix: String appended to force a final answer.
            answer_token: Token that indicates an answer has been generated.

        Returns:
            A tuple of the assembled response text and the total number of completion tokens generated.

        Assumptions:
            - The chat template is applied to the prompt, with think mode enabled.
            - The model has think mode activated.
            - Enabling prefix caching improves performance.

        Limitations:
            - Does not support batching.
        """
        model_opts = self._simplify_and_merge(model_options, is_chat_context=False)

        responses = []
        prompt = self.formatter.print(action)
        if start_think_token is not None:
            prompt += start_think_token
            responses.append(start_think_token)
        backend_opts = self._make_backend_specific_and_remove(
            model_opts, is_chat_context=False
        )
        # Generate the thinking portion.
        max_tok_thd = 0.8
        backend_opts["max_tokens"] = think_max_tokens
        # backend_opts["echo"] = True
        # backend_opts["logprobs"] = 1
        backend_opts["n"] = 1
        gen_tok_count = 0
        curr_prompt = prompt
        min_step_len = 10  # minimum character length of a step to be considered valid

        # Think block: indefinite multi-step operation to satisfy the user's budget.
        while True:
            try:
                completion_response: Completion = self._client.completions.create(
                    model=self._hf_model_id, prompt=curr_prompt, **backend_opts
                )  # type: ignore
            except openai.BadRequestError as e:
                if openai_ollama_batching_error in e.message:
                    FancyLogger.get_logger().error(
                        "If you are trying to call `OpenAIBackend._generate_from_raw` while targeting an ollama server, "
                        "your requests will fail since ollama doesn't support batching requests."
                    )
                raise e

            gen_tok_count += completion_response.usage.completion_tokens
            response = completion_response.choices[0].text
            if think_wait_suffix is None:
                responses.append(response)
                break

            if gen_tok_count >= max_tok_thd * think_max_tokens:
                responses.append(response)
                break
            else:
                step = response.split(end_think_token)[0]
                # The model failed to produce thoughts; exit.
                if len(step.strip()) <= min_step_len:
                    responses.append(response)
                    break

                # Request more steps.
                step = f"{step} {think_wait_suffix}"
                responses.append(step)
                curr_prompt += step

        response = "".join(responses)
        ### debug obtaining final answer
        # response = response.split(end_think_token)[0]
        # response = response.replace(answer_token, "")
        ###
        if answer_token is None or answer_suffix is None:
            return response, gen_tok_count

        # Now get a final answer if we need to.
        # TODO: Here we check whether a final answer exists; technically we should check for an
        # answer outside the think block, but we use the relaxed requirement of finding any answer
        # in the model's response. Consider a strict structural approach in the future, e.g.:
        # ans_portion = response.split(end_think_token)[-1]
        # if answer_token in ans_portion:
        #     return response, gen_tok_count

        if answer_token in response:
            return response, gen_tok_count

        # The answer is not in the response, so force an answer.
        # Guard against begin_response_token=None leaking the literal string "None" into the prompt.
        brt = begin_response_token if begin_response_token is not None else ""
        if end_think_token not in response:
            response = f"{response} {end_think_token}{brt} {answer_suffix}"
        else:
            response = f"{response} {brt}{answer_suffix}"

        # Update the original prompt with the assembled response.
        prompt += response
        if answer_max_tokens is not None:
            backend_opts["max_tokens"] = answer_max_tokens
        else:
            del backend_opts["max_tokens"]

        try:
            completion_response: Completion = self._client.completions.create(
                model=self._hf_model_id, prompt=prompt, **backend_opts
            )  # type: ignore
        except openai.BadRequestError as e:
            if openai_ollama_batching_error in e.message:
                FancyLogger.get_logger().error(
                    "If you are trying to call `OpenAIBackend._generate_from_raw` while targeting an ollama server, "
                    "your requests will fail since ollama doesn't support batching requests."
                )
            raise e

        response += completion_response.choices[0].text
        gen_tok_count += completion_response.usage.completion_tokens
        return response, gen_tok_count

    def _generate_from_raw(
        self,
        actions: list[Component | CBlock],
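
For reference, a minimal usage sketch of the new method (not part of this diff). It assumes a vLLM-served OpenAI-compatible endpoint at http://0.0.0.0:8000/v1, the Granite model used by the tests below, and a chat template that accepts a `thinking` flag; the budgets shown are illustrative.

``` python
# Hypothetical usage sketch for generate_with_budget_forcing (not part of the diff).
# The server URL, model id, budgets, and chat-template kwargs are assumptions.
from transformers import AutoTokenizer

from mellea.backends.openai import OpenAIBackend
from mellea.stdlib.base import CBlock

model_id = "ibm-granite/granite-4.0-tiny-preview"
backend = OpenAIBackend(model_id=model_id, base_url="http://0.0.0.0:8000/v1", api_key="EMPTY")
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Apply the chat template with thinking enabled, as the method's docstring assumes.
messages = [{"role": "user", "content": "What is 1+1? Put your final answer within \\boxed{}."}]
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, thinking=True, add_generation_prompt=True
)

response, gen_tok_count = backend.generate_with_budget_forcing(
    action=CBlock(value=prompt),
    think_max_tokens=512,      # budget for the <think> block
    answer_max_tokens=128,     # budget for the forced final answer
    think_wait_suffix="Wait",  # appended to keep the model thinking until ~80% of the budget is used
    answer_suffix="The final answer is:",
    answer_token="boxed",
)
print(gen_tok_count, response)
```

The method returns the assembled text and the total number of completion tokens consumed, which is what the tests at the end of this diff assert on.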
Lines changed: 2 additions & 0 deletions

@@ -0,0 +1,2 @@
vllm.err
vllm.log
Lines changed: 23 additions & 0 deletions

@@ -0,0 +1,23 @@
# Test for OpenAI API served by vLLM

## Requirements

One of anaconda / miniconda / miniforge.

Make sure to run the test with multiple cores available (e.g. in a cloud instance / cluster job).
Although one core may seem sufficient, vLLM can get stuck in a deadlock if only one is available.

## Installation

Needs to be done only once.
It creates a new conda environment named "mellea_tbf" solely for the purposes of testing or contributing to the think budget-forcing feature.

Run `./install.sh`

## Testing

``` shell
./run_test.sh
```
Lines changed: 7 additions & 0 deletions

@@ -0,0 +1,7 @@
name: mellea_tbf
channels:
  - conda-forge
dependencies:
  - python=3.12  # note: at the time of writing, xformers (< vllm) has a broken wheel for 3.13. https://github.com/facebookresearch/xformers/issues/740#issuecomment-2753869337
  - uv
Lines changed: 18 additions & 0 deletions

@@ -0,0 +1,18 @@
#!/bin/bash -xe

ENV_NAME=mellea_tbf
conda env remove -y -n $ENV_NAME || true
conda env create -f $(readlink -ef $(dirname $0))/environment.yml

in-conda (){
    conda run -n $ENV_NAME "$@"
}

cd ../../../
in-conda uv pip install -e .
cd -
in-conda uv pip install pre-commit
in-conda uv pip install pytest
in-conda uv pip install vllm==0.10.0
in-conda uv pip install outlines
Lines changed: 24 additions & 0 deletions

@@ -0,0 +1,24 @@
#!/bin/bash

ENV_NAME=mellea_tbf
eval "$(conda shell.bash hook)"
conda activate $ENV_NAME

dir=$(readlink -ef $(dirname $0))
rm -f $dir/vllm.log $dir/vllm.err

bash $dir/serve.sh &
vllm_pid=$!

trap "kill -SIGINT $vllm_pid ; wait" EXIT

# Wait until the vLLM server reports that it is ready.
while sleep 1 ; do
    if grep -q "Application startup complete." $dir/vllm.err
    then
        break
    fi
done

python test_think_budget_forcing.py
Lines changed: 37 additions & 0 deletions

@@ -0,0 +1,37 @@
#!/bin/bash

# @Masa note:
# The following is a bash snippet Kristian gave me
# showing how to run vllm with a LoRA adapter loaded.

# HF_GRANITE_ALORA_SNAPSHOT=${HF_HOME:-$HOME/.cache/huggingface}
# HF_GRANITE_ALORA_SNAPSHOT+=/hub/
# HF_GRANITE_ALORA_SNAPSHOT+=models--ibm-granite--granite-3.2-8b-alora-requirement-check/
# HF_GRANITE_ALORA_SNAPSHOT+=snapshots/d55a7a7f5796609bc938c5c151a864cfcc6ab54e

# vllm serve ibm-granite/granite-3.2-8b-instruct \
#     --enable-lora \
#     --lora-modules "{\"name\": \"ibm-granite/granite-3.2-8b-alora-requirement-check\", \"path\": \"${HF_GRANITE_ALORA_SNAPSHOT}\", \"base_model_name\": \"ibm-granite/granite-3.2-8b-instruct\"}" \
#     --dtype bfloat16 \
#     --max-lora-rank 64 \
#     --enable-prefix-caching

# However, in our test, we do not load the aLoRA when we serve.
# In this test, we use the dynamic loading interface from
# https://docs.vllm.ai/en/stable/features/lora.html#dynamically-serving-lora-adapters

# Using this feature requires the following environment variable.
# If you use conda/miniforge,
# this variable must already have been set when you set up the environment.
# See environment.yml.
export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True

echo "launching a vllm server. Logs are found in $(readlink -ef $(dirname $0))/vllm.log"
# At the time of writing, Granite 4.0 vLLM serving did not support prefix caching.
# --enable-prefix-caching \
vllm serve ibm-granite/granite-4.0-tiny-preview \
    --dtype bfloat16 \
    > $(readlink -ef $(dirname $0))/vllm.log \
    2> $(readlink -ef $(dirname $0))/vllm.err
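
As a companion to the comments above, a hedged sketch of the dynamic-loading route (not part of this diff): the adapter name and path are placeholders, the endpoint shapes follow the vLLM docs linked in the script, and the server must be running with VLLM_ALLOW_RUNTIME_LORA_UPDATING=True and LoRA support enabled.

``` python
# Hedged sketch: dynamically load/unload a LoRA adapter on a running vLLM server.
# "my_adapter" and the path are placeholders; endpoints follow the vLLM docs linked above.
import requests

BASE_URL = "http://0.0.0.0:8000"

# Register an adapter at runtime; it then becomes addressable via the "model" field of requests.
resp = requests.post(
    f"{BASE_URL}/v1/load_lora_adapter",
    json={"lora_name": "my_adapter", "lora_path": "/path/to/local/lora/adapter"},
)
resp.raise_for_status()

# Remove it again when done.
requests.post(f"{BASE_URL}/v1/unload_lora_adapter", json={"lora_name": "my_adapter"})
```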
Lines changed: 84 additions & 0 deletions

@@ -0,0 +1,84 @@
from mellea import MelleaSession
from mellea.stdlib.base import CBlock, SimpleContext
from mellea.backends.openai import OpenAIBackend
from transformers import AutoTokenizer
import pytest
import os


class TestOpenAIBackend:
    model_id = "ibm-granite/granite-4.0-tiny-preview"
    backend = OpenAIBackend(
        model_id=model_id,
        base_url="http://0.0.0.0:8000/v1",
        api_key="EMPTY",
    )
    m = MelleaSession(backend, ctx=SimpleContext())
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

    def prepare_prompt_for_math(self, query):
        # Prepare a prompt for math reasoning tasks.
        system_prompt = None  # Use the default from the chat template.
        prompt_suffix = "\nPlease reason step by step, use \n\n to end each step, and put your final answer within \\boxed{}."

        if prompt_suffix:
            query += prompt_suffix

        msg = []
        if system_prompt is not None:
            msg.append({"role": "system", "content": system_prompt})

        msg.append({"role": "user", "content": query})
        prompt = self.tokenizer.apply_chat_template(
            msg,
            tokenize=False,
            thinking=True,
            add_generation_prompt=True,
        )

        return prompt

    def test_generate_from_raw_small(self):
        prompt = "what is 1+1?"
        prompt = self.prepare_prompt_for_math(prompt)
        action = CBlock(value=prompt)
        THINK_MAX_TOKENS = 64
        ANSWER_MAX_TOKENS = 16
        result, gen_tok_cnt = self.m.backend.generate_with_budget_forcing(
            action=action,
            think_max_tokens=THINK_MAX_TOKENS,
            answer_max_tokens=ANSWER_MAX_TOKENS,
            start_think_token="<think>",
            end_think_token="</think>",
            think_wait_suffix="Wait",
            answer_suffix="The final answer is:",
            # answer_suffix="",
            answer_token="boxed",
        )

        assert gen_tok_cnt <= 2 * THINK_MAX_TOKENS

    def test_generate_from_raw_large(self):
        prompt = "what is 1+1?"
        prompt = self.prepare_prompt_for_math(prompt)
        action = CBlock(value=prompt)
        THINK_MAX_TOKENS = 1024
        ANSWER_MAX_TOKENS = 256
        result, gen_tok_cnt = self.m.backend.generate_with_budget_forcing(
            action=action,
            think_max_tokens=THINK_MAX_TOKENS,
            answer_max_tokens=ANSWER_MAX_TOKENS,
            start_think_token="<think>",
            end_think_token="</think>",
            think_wait_suffix="Wait",
            answer_suffix="The final answer is:",
            answer_token="boxed",
        )

        assert gen_tok_cnt >= 0.5 * THINK_MAX_TOKENS


if __name__ == "__main__":
    pytest.main(["-s", __file__])
