
Commit e013f92

Author: Yousef El-Kurdi (committed)

backend interface using _raw_generate

1 parent 1df37d5 · commit e013f92

File tree

5 files changed: +49 -67 lines changed


mellea/backends/openai.py

Lines changed: 11 additions & 32 deletions
@@ -482,6 +482,7 @@ def generate_with_budget_forcing(
         answer_suffix: str = "The final answer is:",
         answer_regex: str = "boxed",
         model_options: dict | None = None,
+        generate_logs: list[GenerateLog] | None = None,
     ) -> tuple[str, int]:
         """Generate with budget forcing using the completions APIs. This relies on raw autocompletion and assumes the model's output is structured in the following form: '<think> ... </think> summary answer'
         The budget forcing method is proposed in the paper: https://arxiv.org/abs/2501.19393
@@ -537,23 +538,13 @@ def generate_with_budget_forcing(
                 break

             backend_opts["max_tokens"] = rem_toks
-            try:
-                completion_response = self._client.completions.create(
-                    model=self._hf_model_id, prompt=curr_prompt, **backend_opts
-                )  # type: ignore
-            except openai.BadRequestError as e:
-                if openai_ollama_batching_error in e.message:
-                    FancyLogger.get_logger().error(
-                        "If you are trying to call `OpenAIBackend.generate_with_budget_forcing while targeting an ollama server, "
-                        "your requests will fail since ollama doesn't support batching requests."
-                    )
-                raise e
-
-            # Necessary for type checker.
-            assert isinstance(completion_response.usage, CompletionUsage)
-            gen_tok_count += completion_response.usage.completion_tokens
+            # TODO workaround to obtain generated token counts
+            # The token count should be relayed by openai's CompletionUsage
+            backend_opts["logprobs"] = 1  # To get number of generated tokens
+            result = self._generate_from_raw([prompt], model_options=backend_opts, generate_logs=generate_logs)
+            gen_tok_count += len(result[0]._meta['oai_completion_response']['logprobs']['token_logprobs'])
             rem_toks = think_max_tokens - gen_tok_count
-            response = completion_response.choices[0].text
+            response = result[0].value

             if think_wait_suffix == "":
                 # non-strict budget form
@@ -611,22 +602,10 @@ def generate_with_budget_forcing(
         else:
             backend_opts.pop("max_tokens", None)  # generate unconditionally

-        try:
-            completion_response = self._client.completions.create(
-                model=self._hf_model_id, prompt=prompt, **backend_opts
-            )  # type: ignore
-        except openai.BadRequestError as e:
-            if openai_ollama_batching_error in e.message:
-                FancyLogger.get_logger().error(
-                    "If you are trying to call `OpenAIBackend.generate_with_budget_forcing while targeting an ollama server, "
-                    "your requests will fail since ollama doesn't support batching requests."
-                )
-            raise e
-
-        # Necessary for type checker.
-        assert isinstance(completion_response.usage, CompletionUsage)
-        response += completion_response.choices[0].text
-        gen_tok_count += completion_response.usage.completion_tokens
+        backend_opts["logprobs"] = 1  # To get number of generated tokens
+        result = self._generate_from_raw([prompt], model_options=backend_opts, generate_logs=generate_logs)
+        response += result[0].value
+        gen_tok_count += len(result[0]._meta['oai_completion_response']['logprobs']['token_logprobs'])
         return response, gen_tok_count

     def _generate_from_raw(
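The key change is that both call sites now route through _generate_from_raw and recover the generated-token count from per-token logprobs rather than from CompletionUsage. A minimal, standalone sketch of that counting trick against an OpenAI-compatible completions endpoint; the base URL, API key, prompt, and model name below are placeholders, not values taken from this commit:

# Sketch only: count generated tokens from logprobs when usage info is not
# relayed. Endpoint, key, prompt, and model name are assumed placeholders.
import openai

client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

resp = client.completions.create(
    model="ibm-granite/granite-4.0-tiny-preview",
    prompt="<think>",
    max_tokens=64,
    logprobs=1,  # ask the server to return per-token logprobs with the text
)

choice = resp.choices[0]
# One logprob entry per generated token, so the list length is the token count.
gen_tok_count = len(choice.logprobs.token_logprobs)
print(choice.text, gen_tok_count)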

test/backends/test_think_budget_forcing/install.sh

Lines changed: 3 additions & 0 deletions
@@ -16,3 +16,6 @@ in-conda uv pip install pre-commit
 in-conda uv pip install pytest
 in-conda uv pip install vllm==0.10.0
 in-conda uv pip install outlines
+# in-conda uv pip install unsloth
+in-conda uv pip install ipdb
+

test/backends/test_think_budget_forcing/run_test.sh

Lines changed: 4 additions & 0 deletions
@@ -1,5 +1,9 @@
 #!/bin/bash

+export PYTHONBREAKPOINT="ipdb.set_trace"
+export LOCAL_TEST_MODEL="ibm-granite/granite-4.0-tiny-preview"
+# export LOCAL_TEST_MODEL="unsloth/Llama-3.2-1B"
+
 ENV_NAME=mellea_tbf
 eval "$(conda shell.bash hook)"
 conda activate $ENV_NAME
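A note on the new environment variable (not part of the diff): PYTHONBREAKPOINT redirects Python's built-in breakpoint() hook (PEP 553) to ipdb.set_trace, which is why install.sh now pulls in ipdb. A minimal sketch of the effect, with a hypothetical helper used only for illustration:

# With PYTHONBREAKPOINT="ipdb.set_trace" exported, breakpoint() opens an ipdb
# prompt instead of the default pdb one. check_budget is illustrative only.
def check_budget(remaining_tokens: int) -> bool:
    breakpoint()  # drops into ipdb here while debugging the tests
    return remaining_tokens > 0

if __name__ == "__main__":
    check_budget(128)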

test/backends/test_think_budget_forcing/serve.sh

Lines changed: 1 addition & 25 deletions
@@ -1,35 +1,11 @@
 #!/bin/bash

-# @Masa note:
-# the following code is a bash snippet Kristian gave me
-# for how to run vllm with lora adapter loaded.
-
-# HF_GRANITE_ALORA_SNAPSHOT=${HF_HOME:-$HOME/.cache/huggingface}
-# HF_GRANITE_ALORA_SNAPSHOT+=/hub/
-# HF_GRANITE_ALORA_SNAPSHOT+=models--ibm-granite--granite-3.2-8b-alora-requirement-check/
-# HF_GRANITE_ALORA_SNAPSHOT+=snapshots/d55a7a7f5796609bc938c5c151a864cfcc6ab54e
-
-# vllm serve ibm-granite/granite-3.2-8b-instruct \
-#     --enable-lora \
-#     --lora-modules "{\"name\": \"ibm-granite/granite-3.2-8b-alora-requirement-check\", \"path\": \"${HF_GRANITE_ALORA_SNAPSHOT}\", \"base_model_name\": \"ibm-granite/granite-3.2-8b-instruct\"}" \
-#     --dtype bfloat16 \
-#     --max-lora-rank 64 \
-#     --enable-prefix-caching
-
-# However, in our test, we do not load the alora when we serve.
-# In this test, we use the dynamic loading interface from
-# https://docs.vllm.ai/en/stable/features/lora.html#dynamically-serving-lora-adapters
-
-# Using this feature requires the following environment variable.
-# If you use conda/miniforge,
-# this variable must have been set already when you set up the environment.
-# see environment.yml.
 export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True

 echo "launching a vllm server. Logs are found in $(readlink -ef $(dirname $0))/vllm.log"
 # At the time of writing this code, Granite 4.4 vLLM serving did not support prefix-caching
 # --enable-prefix-caching \
-vllm serve ibm-granite/granite-4.0-tiny-preview \
+vllm serve $LOCAL_TEST_MODEL \
     --dtype bfloat16 \
     > $(readlink -ef $(dirname $0))/vllm.log \
     2> $(readlink -ef $(dirname $0))/vllm.err
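The serve script now picks up LOCAL_TEST_MODEL exported by run_test.sh and exposes an OpenAI-compatible endpoint on vLLM's default port. For orientation only, a sketch of a quick smoke test against that endpoint, mirroring the host and API-key conventions used in the test file; the prompt and fallback model name are assumptions and nothing here is part of the commit:

# Sketch: verify the locally served vLLM endpoint responds before running the
# tests. Host, key, prompt, and fallback model name are assumptions.
import os
import openai

client = openai.OpenAI(
    base_url=f"http://{os.environ.get('OLLAMA_HOST', 'localhost:8000')}/v1",
    api_key="ollama",  # a local vLLM server accepts any non-empty key
)

resp = client.completions.create(
    model=os.environ.get("LOCAL_TEST_MODEL", "ibm-granite/granite-4.0-tiny-preview"),
    prompt="2 + 2 =",
    max_tokens=8,
)
print(resp.choices[0].text)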

test/backends/test_think_budget_forcing/test_think_budget_forcing.py

Lines changed: 30 additions & 10 deletions
@@ -1,19 +1,35 @@
 from mellea import MelleaSession
+from mellea.backends.model_ids import OPENAI_GPT_OSS_20B, META_LLAMA_3_2_1B, IBM_GRANITE_4_TINY_PREVIEW_7B
 from mellea.stdlib.base import CBlock, SimpleContext
 from mellea.backends.openai import OpenAIBackend
+from mellea.backends.formatter import TemplateFormatter
 from transformers import AutoTokenizer
 import pytest
 import os

+
 class TestOpenAIBackend:
+    MODEL_ID = os.environ.get("LOCAL_TEST_MODEL", META_LLAMA_3_2_1B)
+    # Local testing mode
+    if MODEL_ID == "ibm-granite/granite-4.0-tiny-preview":
+        MODEL_ID = IBM_GRANITE_4_TINY_PREVIEW_7B
+
+    elif MODEL_ID == "unsloth/Llama-3.2-1B":
+        MODEL_ID = META_LLAMA_3_2_1B
+
+    else:
+        raise RuntimeError(f"Unsupported model-id:{MODEL_ID}")
+
     model_id = "ibm-granite/granite-4.0-tiny-preview"
     backend = OpenAIBackend(
-        model_id=model_id,
-        base_url="http://0.0.0.0:8000/v1",
-        api_key="EMPTY",
+        model_id=MODEL_ID,
+        formatter=TemplateFormatter(model_id=MODEL_ID),
+        base_url=f"http://{os.environ.get('OLLAMA_HOST', 'localhost:8000')}/v1",
+        api_key="ollama",
     )
+
     m = MelleaSession(backend, ctx=SimpleContext())
-    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID.hf_model_name, trust_remote_code=True)

     def prepare_prmpt_for_math(self, query):
         # Preparing prompt for math reasoning tasks
@@ -28,12 +44,16 @@ def prepare_prmpt_for_math(self, query):
         msg.append({"role": "system", "content": system_prompt})

         msg.append({"role": "user", "content": query})
-        prompt = self.tokenizer.apply_chat_template(
-            msg,
-            tokenize=False,
-            thinking=True,
-            add_generation_prompt=True,
-        )
+        if self.tokenizer.chat_template is None:
+            raise RuntimeError(f"No explicit chat template is defined for model-id: ")
+
+        else:
+            prompt = self.tokenizer.apply_chat_template(
+                msg,
+                tokenize=False,
+                thinking=True,
+                add_generation_prompt=True,
+            )

         return prompt
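The new guard simply skips the template path for checkpoints that ship without a chat template (plain base models). As a point of reference only, a standalone sketch of the same prompt-preparation path; the model name and messages are placeholders, and thinking=True is the same template flag the test passes:

# Standalone sketch of the prompt preparation used in the test. Model name
# and messages are assumptions; thinking=True is forwarded to the template.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "ibm-granite/granite-4.0-tiny-preview", trust_remote_code=True
)

msg = [
    {"role": "system", "content": "Solve the problem and give the final answer."},
    {"role": "user", "content": "What is 12 * 13?"},
]

if tokenizer.chat_template is None:
    # Base models without a chat template cannot be prompted this way.
    raise RuntimeError("No explicit chat template is defined for this model.")

prompt = tokenizer.apply_chat_template(
    msg,
    tokenize=False,
    thinking=True,
    add_generation_prompt=True,
)
print(prompt)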
