
Commit b18e03d

feat: Best-of-N Sampling with Process Reward Models (#118)
* Implements best-of-N Sampling with PRM support.
* Backends: adds specific classes for querying PRMs.
* Standard Library: adds a ScorerRequirement class which serves as the Requirement interface to PRMs.
* Provides a concrete implementation of these abstract interfaces using the HuggingFace inference engine and the `ibm-granite/granite-3.3-8b-lora-math-prm` PRM model.
1 parent e2746a1 commit b18e03d

File tree

11 files changed: +752 −8 lines

docs/examples/best_of_n/prm.py

Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
+"""Example of Using Best of N with PRMs"""
+
+from docs.examples.helper import w
+from mellea import start_session
+from mellea.backends.process_reward_models.huggingface.prms import (
+    HFGenerativePRM,
+    HFRegressionPRM,
+)
+from mellea.backends.types import ModelOption
+from mellea.stdlib.rewards.prm_scorer import PRMScorer
+from mellea.stdlib.sampling import BestofNSamplingStrategy
+
+# create a session for the generator using Granite 3.3 8B on Huggingface and a simple context [see below]
+m = start_session(backend_name="hf", model_options={ModelOption.MAX_NEW_TOKENS: 512})
+
+# initialize the PRM model
+prm_model = HFGenerativePRM(
+    model_name_or_path="ibm-granite/granite-3.3-8b-lora-math-prm",
+    score_token="Y",
+    generation_prompt="Is this response correct so far (Y/N)?",
+    step_separator="\n\n",
+)
+
+# can also initialize a Regression PRM model
+# prm_model = HFRegressionPRM(
+#     model_name_or_path="granite-3.3-8b-math-prm-regression",
+#     score_token="<end_of_step>",
+#     step_separator="\n\n")
+
+# create PRM scorer object
+prm = PRMScorer(prm_model=prm_model, preference_ordering="max")
+
+# Do Best of N sampling with the PRM scorer and an additional requirement
+BoN_prm = m.instruct(
+    "Sarah has 12 apples. She gives 5 of them to her friend. How many apples does Sarah have left?",
+    strategy=BestofNSamplingStrategy(loop_budget=3),
+    model_options={"temperature": 0.9, "do_sample": True},
+    requirements=["provide final answer like 'Final Answer:'", prm],
+)
+
+# print result
+print(f"***** BoN ****\n{w(BoN_prm)}\n*******")

mellea/backends/huggingface.py

Lines changed: 54 additions & 0 deletions
@@ -30,6 +30,7 @@
 from mellea.backends.cache import Cache, SimpleLRUCache
 from mellea.backends.formatter import Formatter, FormatterBackend, TemplateFormatter
 from mellea.backends.model_ids import ModelIdentifier
+from mellea.backends.process_reward_models import PRM
 from mellea.backends.tools import (
     add_tools_from_context_actions,
     add_tools_from_model_options,
@@ -672,3 +673,56 @@ def __init__(
         self._generation_prompt_tokens = self._backend._tokenizer(
             self._generation_prompt, return_tensors="pt"
         ).to(self._backend._device)
+
+
+class HFProcessRewardModel(PRM, abc.ABC):
+    def __init__(
+        self, model_name_or_path: str, score_token: str, device: str | None = None
+    ):
+        """Initialize a PRM that works with a huggingface backend. Currently supported and tested with IBM Process Reward Models.
+
+        Args:
+            model_name_or_path (str): A local path to a PRM or a huggingface PRM.
+            score_token (str): Token whose logits correspond to the PRM score. Can be a step delimiter (for non-generative PRMs) or a correctness indicator (for generative PRMs).
+            device (str): The computational device to use ("cuda" for GPU, "mps" for Apple Silicon, or "cpu"); defaults to None. If not specified, the best available device is selected automatically.
+        """
+        super().__init__(model_name_or_path)
+
+        # auto-select a device if none was specified
+        self._device = device
+        if device is None:
+            device_name: str = (
+                "cuda"
+                if torch.cuda.is_available()
+                else "mps"
+                if torch.backends.mps.is_available()
+                else "cpu"
+            )
+            assert device_name is not None
+            self._device = torch.device(device_name)  # type: ignore
+
+        self.model: PreTrainedModel = AutoModelForCausalLM.from_pretrained(
+            self.model_name_or_path, torch_dtype=torch.bfloat16
+        )
+        self.model.to(self._device)  # type: ignore
+        self.model.eval()
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)
+
+        self._score_token = score_token
+        self._score_token_id = self.tokenizer.encode(
+            self._score_token, add_special_tokens=False
+        )[0]
+
+    def stepify(self, content: str, step_separator: str) -> list[str]:
+        """Splits the assistant response into steps to score.
+
+        Args:
+            content: assistant response to score
+            step_separator: string on which to separate the content into steps
+        """
+        # convert the assistant message into a list of non-empty steps
+        list_of_steps = [
+            step.strip() for step in content.split(step_separator) if step.strip() != ""
+        ]
+        return list_of_steps
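
Note: the concrete `HFGenerativePRM` and `HFRegressionPRM` classes used in the example (imported from `mellea.backends.process_reward_models.huggingface.prms`) are part of this commit but not shown in this excerpt. Purely as an illustration of how the pieces above fit together, a generative subclass of `HFProcessRewardModel` might score each step by the probability the model assigns to the score token; the prompt wording, hard-coded step separator, and mean aggregation below are assumptions for the sketch, not the repository's implementation.

# Hypothetical sketch only -- not the code added by this commit.
# It reuses the attributes set up in HFProcessRewardModel.__init__ above
# (self.model, self.tokenizer, self._device, self._score_token_id) and stepify().
import torch


class SketchGenerativePRM(HFProcessRewardModel):
    """Scores each reasoning step by the probability of the score token (e.g. "Y")."""

    def score(self, query: str, response: str) -> tuple[list[float], list[list[float]]]:
        step_scores: list[float] = []
        prefix = ""
        for step in self.stepify(response, "\n\n"):
            prefix = f"{prefix}\n\n{step}".strip()
            # Ask the PRM whether the partial response is correct so far (assumed prompt).
            prompt = f"{query}\n\n{prefix}\n\nIs this response correct so far (Y/N)?"
            inputs = self.tokenizer(prompt, return_tensors="pt").to(self._device)
            with torch.no_grad():
                logits = self.model(**inputs).logits[0, -1]  # next-token logits
            probs = torch.softmax(logits, dim=-1)
            step_scores.append(probs[self._score_token_id].item())
        # Aggregate per-step scores into one final score (mean, as an assumption).
        final = sum(step_scores) / max(len(step_scores), 1)
        return [final], [step_scores]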

mellea/backends/litellm.py

Lines changed: 3 additions & 3 deletions
@@ -5,9 +5,9 @@
 from collections.abc import Callable
 from typing import Any

-import litellm
-import litellm.litellm_core_utils
-import litellm.litellm_core_utils.get_supported_openai_params
+import litellm  # type: ignore
+import litellm.litellm_core_utils  # type: ignore
+import litellm.litellm_core_utils.get_supported_openai_params  # type: ignore

 import mellea.backends.model_ids as model_ids
 from mellea.backends import BaseModelSubclass

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
+"""Abstract interfaces for Backends that implement Process Reward Models (can be adapted to include other scorers)"""
+
+import abc
+
+
+class PRM(abc.ABC):
+    def __init__(self, model_name_or_path):
+        # Leave the implementation of the model to the inheriting class
+        self.model_name_or_path = model_name_or_path
+
+    @abc.abstractmethod
+    def score(self, query: str, response: str) -> tuple[list[float], list[list[float]]]:
+        """Returns a final score and per-step scores for the input to the model"""
+        ...
+
+    @abc.abstractmethod
+    def stepify(self, response: str, step_separator: str) -> list[str]:
+        """Splits the assistant response into steps to score
+
+        Args:
+            response: assistant response to score
+            step_separator: string on which to separate the response into steps
+        """
+        ...
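
For reference, the interface only obliges a subclass to provide `score` and `stepify`; how (or whether) a model is loaded is left entirely to the implementation. A minimal toy subclass, illustrative only and not part of this commit, could look like the following.

# Illustrative toy scorer -- not part of this commit.
class LengthPRM(PRM):
    """Rewards shorter steps; exists only to show the interface contract."""

    def stepify(self, response: str, step_separator: str) -> list[str]:
        return [s.strip() for s in response.split(step_separator) if s.strip()]

    def score(self, query: str, response: str) -> tuple[list[float], list[list[float]]]:
        steps = self.stepify(response, "\n\n")
        step_scores = [1.0 / (1.0 + len(s)) for s in steps]
        # Final score is the weakest step; per-step scores are returned alongside.
        return [min(step_scores, default=0.0)], [step_scores]
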
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+"""Process Reward Model Implementations with Huggingface backends"""
