42 changes: 42 additions & 0 deletions docs/examples/best_of_n/prm.py
@@ -0,0 +1,42 @@
"""Example of Using Best of N with PRMs"""

from docs.examples.helper import w
from mellea import start_session
from mellea.backends.process_reward_models.huggingface.prms import (
    HFGenerativePRM,
    HFRegressionPRM,
)
from mellea.backends.types import ModelOption
from mellea.stdlib.rewards.prm_scorer import PRMScorer
from mellea.stdlib.sampling import BestofNSamplingStrategy

# create a session for the generator using Granite 3.3 8B on Hugging Face with a simple context
m = start_session(backend_name="hf", model_options={ModelOption.MAX_NEW_TOKENS: 512})

# initialize the PRM model
prm_model = HFGenerativePRM(
    model_name_or_path="ibm-granite/granite-3.3-8b-lora-math-prm",
    score_token="Y",
    generation_prompt="Is this response correct so far (Y/N)?",
    step_separator="\n\n",
)

# # can also initialize a Regression PRM model
# prm_model = HFRegressionPRM(
#     model_name_or_path="granite-3.3-8b-math-prm-regression",
#     score_token="<end_of_step>",
#     step_separator="\n\n",
# )

# create PRM scorer object
prm = PRMScorer(prm_model=prm_model, preference_ordering="max")

# Do Best of N sampling with the PRM scorer and an additional requirement
BoN_prm = m.instruct(
    "Sarah has 12 apples. She gives 5 of them to her friend. How many apples does Sarah have left?",
    strategy=BestofNSamplingStrategy(loop_budget=3),
    model_options={"temperature": 0.9, "do_sample": True},
    requirements=["provide final answer like 'Final Answer:'", prm],
)

# print result
print(f"***** BoN ****\n{w(BoN_prm)}\n*******")
54 changes: 54 additions & 0 deletions mellea/backends/huggingface.py
@@ -30,6 +30,7 @@
from mellea.backends.cache import Cache, SimpleLRUCache
from mellea.backends.formatter import Formatter, FormatterBackend, TemplateFormatter
from mellea.backends.model_ids import ModelIdentifier
from mellea.backends.process_reward_models import PRM
from mellea.backends.tools import (
add_tools_from_context_actions,
add_tools_from_model_options,
@@ -672,3 +673,56 @@ def __init__(
        self._generation_prompt_tokens = self._backend._tokenizer(
            self._generation_prompt, return_tensors="pt"
        ).to(self._backend._device)


class HFProcessRewardModel(PRM, abc.ABC):
    """Base class for Process Reward Models served through Hugging Face models."""

    def __init__(
        self, model_name_or_path: str, score_token: str, device: str | None = None
    ):
"""Initialize an PRM that works with a huggingface backend. Currently supports and tested with IBM Process Reward Models

Args:
model_name_or_path (str): A local path to PRM or a huggingface PRM
score_token (str): token who's logits correspond to the PRM score. Can be a step demarker (for non-generative PRMs) or a correctness indicator (for generative PRMs)
device (str): device: The computational device to use ("cuda" for GPU, "mps" for Apple Silicon, or "cpu"), defaults to None. If not specified, the best available device will be automatically selected.
"""
        super().__init__(model_name_or_path)

        # auto-select a device if none was specified
        self._device = device
        if device is None:
            device_name: str = (
                "cuda"
                if torch.cuda.is_available()
                else "mps"
                if torch.backends.mps.is_available()
                else "cpu"
            )
            assert device_name is not None
            self._device = torch.device(device_name)  # type: ignore

        self.model: PreTrainedModel = AutoModelForCausalLM.from_pretrained(
            self.model_name_or_path, torch_dtype=torch.bfloat16
        )
        self.model.to(self._device)  # type: ignore
        self.model.eval()
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)

        self._score_token = score_token
        self._score_token_id = self.tokenizer.encode(
            self._score_token, add_special_tokens=False
        )[0]

    def stepify(self, content: str, step_separator: str) -> list[str]:
        """Splits the assistant response into steps to score.

        Args:
            content: assistant response to score
            step_separator: string on which to separate the content into steps
        """
        # convert the assistant message into a list of non-empty steps
        list_of_steps = [
            step.strip() for step in content.split(step_separator) if step.strip() != ""
        ]
        return list_of_steps
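`HFProcessRewardModel` leaves `score()` to concrete subclasses such as `HFGenerativePRM`. The sketch below is not the implementation from this PR; it is one plausible way a generative PRM could score each step by reading the next-token probability of the score token after a correctness prompt. The class name, the prompt layout, and the averaging of step scores into a final score are assumptions.

```python
import torch


class SketchGenerativePRM(HFProcessRewardModel):
    """Illustrative generative PRM; not the HFGenerativePRM shipped in this PR."""

    def __init__(
        self,
        model_name_or_path: str,
        score_token: str = "Y",
        generation_prompt: str = "Is this response correct so far (Y/N)?",
        step_separator: str = "\n\n",
        device: str | None = None,
    ):
        super().__init__(model_name_or_path, score_token, device)
        self.generation_prompt = generation_prompt
        self.step_separator = step_separator

    def score(self, query: str, response: str) -> tuple[list[float], list[list[float]]]:
        step_scores: list[float] = []
        context = query
        for step in self.stepify(response, self.step_separator):
            # append the step and the correctness prompt, then read the
            # next-token probability of the score token (e.g. "Y")
            context = f"{context}\n{step}\n{self.generation_prompt}"
            inputs = self.tokenizer(context, return_tensors="pt").to(self._device)
            with torch.no_grad():
                logits = self.model(**inputs).logits[0, -1]
            probs = torch.softmax(logits, dim=-1)
            step_scores.append(probs[self._score_token_id].item())
        # assumed aggregation: mean of step scores as the final score
        final_score = sum(step_scores) / max(len(step_scores), 1)
        return [final_score], [step_scores]
```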
6 changes: 3 additions & 3 deletions mellea/backends/litellm.py
@@ -5,9 +5,9 @@
from collections.abc import Callable
from typing import Any

-import litellm
-import litellm.litellm_core_utils
-import litellm.litellm_core_utils.get_supported_openai_params
+import litellm # type: ignore
+import litellm.litellm_core_utils # type: ignore
+import litellm.litellm_core_utils.get_supported_openai_params # type: ignore

import mellea.backends.model_ids as model_ids
from mellea.backends import BaseModelSubclass
24 changes: 24 additions & 0 deletions mellea/backends/process_reward_models/__init__.py
@@ -0,0 +1,24 @@
"""Abstract interfaces for Backends that implement Process Reward Models (can be adapted to include other scorers)"""

import abc


class PRM(abc.ABC):
    """Abstract base class for Process Reward Models."""

    def __init__(self, model_name_or_path):
        # Leave the implementation of the model to the inheriting class.
        self.model_name_or_path = model_name_or_path

    @abc.abstractmethod
    def score(self, query: str, response: str) -> tuple[list[float], list[list[float]]]:
        """Returns the final score(s) and per-step scores for the input to the model."""
        ...

    @abc.abstractmethod
    def stepify(self, response: str, step_separator: str) -> list[str]:
        """Splits the assistant response into steps to score.

        Args:
            response: assistant response to score
            step_separator: string on which to separate the response into steps
        """
        ...
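To make the contract concrete, here is a minimal, purely illustrative subclass of `PRM`. The length-based scoring is a placeholder heuristic invented for this sketch, and `ToyPRM` is not part of this PR.

```python
class ToyPRM(PRM):
    """Toy PRM used only to illustrate the abstract interface."""

    def stepify(self, response: str, step_separator: str) -> list[str]:
        # split the response on the separator and drop empty steps
        return [s.strip() for s in response.split(step_separator) if s.strip()]

    def score(self, query: str, response: str) -> tuple[list[float], list[list[float]]]:
        steps = self.stepify(response, "\n\n")
        # placeholder heuristic: longer steps get higher scores, capped at 1.0
        step_scores = [min(len(s) / 100.0, 1.0) for s in steps]
        final = sum(step_scores) / max(len(step_scores), 1)
        return [final], [step_scores]


# usage: ToyPRM("toy-model").score(query, response)
```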
@@ -0,0 +1 @@
"""Process Reward Model Implementations with Huggingface backends"""