Commits (35)
- `bd503cf` Initial commit - think budget-forcing - tests run - WIP (Aug 28, 2025)
- `9385af8` adds zero-think case (Aug 28, 2025)
- `fda0768` resolved type checking errors (Aug 29, 2025)
- `ff03b6f` fixes typo and some scripts (Aug 29, 2025)
- `d73c1ac` Merge branch 'main' into think_bf (yelkurdi, Aug 29, 2025)
- `1df37d5` Merge branch 'main' into think_bf (yelkurdi, Sep 3, 2025)
- `e013f92` backend interface using _raw_generate (Sep 5, 2025)
- `556634b` Bump version number from 0.0.2 to 0.0.3 (#117) (nrfulton, Sep 3, 2025)
- `6b6599d` ci: Rename .mergify.yml to mergify.yml (#119) (avinash2692, Sep 3, 2025)
- `396bf7a` docs: fix typo on README (#116) (mdevino, Sep 4, 2025)
- `cad893f` refactor: Full refactor of the Decompose CLI Tool & introduction of p… (tuliocoppola, Sep 4, 2025)
- `75e3d0e` moved the budget forcing function into mellea/stdlib/sampling_algos/b… (Sep 7, 2025)
- `fd7a3b3` adds budget forcing fn (Sep 7, 2025)
- `8f1a820` Merge branch 'main' into think_bf (yelkurdi, Sep 7, 2025)
- `599eac1` feat: adds think budget forcing - relocated test dir (Sep 7, 2025)
- `8098128` Update budget_forcing.py (yelkurdi, Sep 15, 2025)
- `56a828a` Merge branch 'main' into think_bf (nrfulton, Sep 19, 2025)
- `3535b65` Merge branch 'main' into think_bf (yelkurdi, Oct 6, 2025)
- `ad076c5` merging main in-progress (yelkurdi, Oct 14, 2025)
- `66ae952` Merge branch 'main' into think_bf (yelkurdi, Oct 14, 2025)
- `05c8185` main branch updates (yelkurdi, Oct 16, 2025)
- `80e8485` updates to think_budget_forcing function to match sampling strategy i… (yelkurdi, Oct 16, 2025)
- `7f2c8f1` adds sampling strategy for budget forcing (yelkurdi, Oct 16, 2025)
- `2493ca1` minor fixes (yelkurdi, Oct 17, 2025)
- `dbadd21` feat: ollama generate_from_raw uses existing event loop (jakelorocco, Oct 17, 2025)
- `4396f81` Merge branch 'main' into think_bf (yelkurdi, Oct 17, 2025)
- `f4dc004` fix: add blocking prevention mech (jakelorocco, Oct 20, 2025)
- `c143ce4` Merge branch 'main' into jal/ollama-generate-from-raw (jakelorocco, Oct 20, 2025)
- `99b3156` Merge branch 'jal/ollama-generate-from-raw' into think_bf (yelkurdi, Oct 20, 2025)
- `8d91627` fixes of async inconsistencies and incorporating Jacob's branch (yelkurdi, Oct 20, 2025)
- `d0c9e41` Merge branch 'main' into think_bf (yelkurdi, Nov 4, 2025)
- `8796661` updates interface significantly after prompting `_generate_from_raw` … (yelkurdi, Nov 6, 2025)
- `5664a8d` minor fix to test case (yelkurdi, Nov 6, 2025)
- `d83fb84` minor updates (yelkurdi, Nov 6, 2025)
- `1a999b9` Merge branch 'main' into think_bf (yelkurdi, Nov 6, 2025)
148 changes: 148 additions & 0 deletions mellea/stdlib/sampling_algos/budget_forcing.py
@@ -0,0 +1,148 @@
import re

from mellea.stdlib.base import (
    CBlock,
    Component,
    GenerateLog,
    ModelOutputThunk,
)
from mellea.stdlib.session import MelleaSession


def think_budget_forcing(
    session: MelleaSession,
    action: CBlock | Component,
    *,
    think_max_tokens: int = 4096,
    answer_max_tokens: int | None = None,
    start_think_token: str = "<think>",
    end_think_token: str = "</think>",
    begin_response_token: str = "",
    end_response_token: str = "",
    think_wait_suffix: str = "",
    answer_suffix: str = "The final answer is:",
    answer_regex: str = r"\\boxed{.*?}",
    model_options: dict | None = None,
    generate_logs: list[GenerateLog] | None = None,
) -> tuple[str, int]:
"""Generate with budget forcing using the completions APIs. This relies on raw autocompletion and assumes the model's output is structured in the following form: '<think> ... </think> summary answer'
The budget forcing method is proposed in the paper: https://arxiv.org/abs/2501.19393
This implementation tries to follow the key outlines in the paper while ensuring stable and fail-safe operation.
This is performed via multi-step generation. The model will be called multiple times until requirements are met, in other words, the response will be assembled conditionally.

Args:
think_max_tokens: Budget in number of tokens allocated for the think block
answer_max_tokens: Budget in number of tokens allocated for the summary and answer block, None indicates generating till EoS
start_think_token: String indicating start of think block, default <think>
end_think_token: String indicating end of think block, default </think>
begin_response_token: Used by certain models, string indicating start of response block, e.g. "<response>", default None
end_response_token: Used by certain models, string indicating end of response block, e.g. "</response>", default None
think_wait_suffix: String to append to force continued thinking, e.g. "\nWait" if set to None we will not force additional thinking. Use None for upper-bound budget case
answer_suffix: String to append to force a final answer
answer_regex: Answer regex which indicates an answer is generated

Assumptions:
- The chat template is applied on prompt, with think mode enabled
- Model is think mode activated
- enabling prefix-caching improves performance

Limitations:
- Does not support batching
"""

    backend = session.backend
    model_options = backend._simplify_and_merge(model_options, is_chat_context=False)

    responses = []
    prompt = backend.formatter.print(action)
    if start_think_token:
        prompt += start_think_token
        responses.append(start_think_token)

    # Generate thinking portion
    # model_options["echo"] = True
    # model_options["logprobs"] = 1
    model_options["n"] = 1
    rem_toks = think_max_tokens
    gen_tok_count = 0
    curr_prompt = prompt
    min_step_len = 10  # minimum character length of a step to be considered valid

    # Think block: indefinite multi-step operation to satisfy the user's budget
    while True:
        if rem_toks <= 0:  # zero-think case
            break

        if rem_toks <= min_step_len:  # minimum step length reached
            break

        model_options["max_tokens"] = rem_toks
        # TODO: workaround to obtain generated token counts.
        # The token count should be relayed by OpenAI's CompletionUsage.
        model_options["logprobs"] = 1  # To get the number of generated tokens
        result = backend._generate_from_raw(
            [curr_prompt], model_options=model_options, generate_logs=generate_logs
        )
        gen_tok_count += len(result[0]._meta['oai_completion_response']['logprobs']['token_logprobs'])
        rem_toks = think_max_tokens - gen_tok_count
        response = result[0].value

        if think_wait_suffix == "":
            # non-strict budget form
            responses.append(response)
            break

        if rem_toks <= 0:
            responses.append(response)
            break

        else:
            # Keep only the content before the end-of-think marker (if one was emitted)
            step = response.split(end_think_token)[0] if end_think_token else response
            # model fails to produce thoughts, let's exit
            if len(step.strip()) <= min_step_len:
                responses.append(response)
                break

            # request more steps
            step = f"{step} {think_wait_suffix}"
            responses.append(step)
            curr_prompt += step

response = "".join(responses)
if answer_regex is None or answer_suffix is None:
return response, gen_tok_count

# Now get a final answer if we need to
# TODO: Here we check if a final answer exists, technically we should check for an answer outside
# The think block, but we will use relaxed requirement of finding any answer in the model's response.
# Consider a strict structural approach in the future.
# e.g.
# answer_blk = response.split(end_think_token)[-1]

# Check if answer in response
matches = re.findall(answer_regex, response, re.DOTALL)
if len(matches) > 0:
return response, gen_tok_count

# Answer is not in response, let's force an answer
if end_think_token and end_think_token not in response:
response += f" {end_think_token}"

if begin_response_token and begin_response_token not in response:
response += f" {begin_response_token}"

if answer_suffix:
response += f" {answer_suffix}"

# update original prompt with assembled response
prompt += response
if answer_max_tokens is not None:
model_options["max_tokens"] = answer_max_tokens

else:
model_options.pop("max_tokens", None) # generate unconditionally

model_options["logprobs"] = 1 # To get number of generated tokens
result = backend._generate_from_raw([prompt], model_options=model_options, generate_logs=generate_logs)
response += result[0].value
gen_tok_count += len(result[0]._meta['oai_completion_response']['logprobs']['token_logprobs'])
return response, gen_tok_count
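For context, here is a minimal usage sketch of the new `think_budget_forcing` helper (not part of this diff). It assumes a Mellea session whose backend supports raw completions, such as the vLLM-served OpenAI-compatible backend used by the tests below; the `mellea.start_session()` entry point, the model configuration, and the prompt text are illustrative assumptions.

```python
# Usage sketch only, not part of this PR. Assumes a session whose backend
# implements _generate_from_raw (e.g. an OpenAI-compatible completions backend
# served by vLLM, as in the tests below).
import mellea
from mellea.stdlib.base import CBlock, GenerateLog
from mellea.stdlib.sampling_algos.budget_forcing import think_budget_forcing

m = mellea.start_session()  # assumed entry point; pick a completions-capable backend
logs: list[GenerateLog] = []

# Cap the think block at ~1024 tokens. With the default (empty) think_wait_suffix,
# thinking is not extended; if no \boxed{...} answer is found, the think block is
# closed and a final answer is forced with answer_suffix.
response, n_tokens = think_budget_forcing(
    m,
    CBlock("What is 17 * 23? Put the final answer in \\boxed{}."),
    think_max_tokens=1024,
    answer_max_tokens=256,
    generate_logs=logs,
)
print(n_tokens, response)
```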

2 changes: 2 additions & 0 deletions test/stdlib_basics/test_think_budget_forcing/.gitignore
@@ -0,0 +1,2 @@
vllm.err
vllm.log
23 changes: 23 additions & 0 deletions test/stdlib_basics/test_think_budget_forcing/README.md
@@ -0,0 +1,23 @@

# Test for the OpenAI API served by vLLM

## Requirements

anaconda / miniconda / miniforge.

Make sure to run the test with multiple cores available (e.g., on a cloud instance or in a cluster job). Even if one core seems sufficient, vLLM can deadlock when only a single core is available.

## Installation

Install by running `./install.sh`; this only needs to be done once. The script creates a new conda environment named "mellea_tbf" dedicated to testing and contributing to the think budget-forcing feature.

## Testing

``` shell
./run_test.sh
```
7 changes: 7 additions & 0 deletions test/stdlib_basics/test_think_budget_forcing/environment.yml
@@ -0,0 +1,7 @@

name: mellea_tbf
channels:
  - conda-forge
dependencies:
  - python=3.12  # note: at the time of writing, xformers (a vllm dependency) has a broken wheel for Python 3.13. https://github.com/facebookresearch/xformers/issues/740#issuecomment-2753869337
  - uv
10 changes: 10 additions & 0 deletions test/stdlib_basics/test_think_budget_forcing/exec_sampling_test.sh
@@ -0,0 +1,10 @@
#!/bin/bash

source set_variables.sh

eval "$(conda shell.bash hook)"
conda activate $ENV_NAME

export LOCAL_TEST_MODEL

python test_think_budget_forcing.py
22 changes: 22 additions & 0 deletions test/stdlib_basics/test_think_budget_forcing/install.sh
@@ -0,0 +1,22 @@
#!/bin/bash -xe

source set_variables.sh

conda env remove -y -n $ENV_NAME || true
conda env create -f $(readlink -f $(dirname $0))/environment.yml

in-conda () {
    conda run -n $ENV_NAME "$@"
}


cd ../../../
in-conda uv pip install -e .
cd -
in-conda uv pip install pre-commit
in-conda uv pip install pytest
in-conda uv pip install vllm==0.10.0
in-conda uv pip install outlines
# in-conda uv pip install unsloth
in-conda uv pip install ipdb

24 changes: 24 additions & 0 deletions test/stdlib_basics/test_think_budget_forcing/run_test.sh
@@ -0,0 +1,24 @@
#!/bin/bash

source set_variables.sh

eval "$(conda shell.bash hook)"
conda activate $ENV_NAME

rm -f $VLLM_LOG $VLLM_ERR

bash ./serve.sh &
VLLM_PID=$!

trap "kill -SIGINT $VLLM_PID ; wait" EXIT

while sleep 1 ; do
if grep -q "Application startup complete." $VLLM_ERR
then
break
fi
done

bash exec_sampling_test.sh


16 changes: 16 additions & 0 deletions test/stdlib_basics/test_think_budget_forcing/serve.sh
@@ -0,0 +1,16 @@
#!/bin/bash

source set_variables.sh
eval "$(conda shell.bash hook)"
conda activate $ENV_NAME
export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True

echo "launching a vllm server. Logs are found in $(readlink -ef $(dirname $0))/vllm.log"
# At the time of writing this code, Granite 4.4 vLLM serving did not support prefix-caching
# --enable-prefix-caching \
vllm serve $LOCAL_TEST_MODEL \
--dtype bfloat16 \
> $VLLM_LOG \
2> $VLLM_ERR


8 changes: 8 additions & 0 deletions test/stdlib_basics/test_think_budget_forcing/set_variables.sh
@@ -0,0 +1,8 @@
#!/bin/bash

PYTHONBREAKPOINT="ipdb.set_trace"
LOCAL_TEST_MODEL="ibm-granite/granite-4.0-tiny-preview"
ENV_NAME=mellea_tbf
DIR=$(readlink -ef $(dirname $0))
VLLM_LOG=$DIR/vllm.log
VLLM_ERR=$DIR/vllm.err