71 changes: 71 additions & 0 deletions apps/vllm/judge.py
@@ -0,0 +1,71 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""To run:
export HF_HUB_DISABLE_XET=1
python -m apps.vllm.judge --config apps/vllm/llama3_8b.yaml
"""

import asyncio
import os

from forge.actors.judge import EvaluationMode, Judge
from forge.cli.config import parse
from forge.controller.provisioner import shutdown

from forge.observability.metric_actors import get_or_create_metric_logger
from omegaconf import DictConfig

os.environ["HYPERACTOR_MESSAGE_DELIVERY_TIMEOUT_SECS"] = "600"
os.environ["HYPERACTOR_CODE_MAX_FRAME_LENGTH"] = "1073741824"


async def run(cfg: DictConfig):
metric_logging_cfg = cfg.get("metric_logging", {"console": {"log_per_rank": False}})
mlogger = await get_or_create_metric_logger()
await mlogger.init_backends.call_one(metric_logging_cfg)

prompt = "What is the capital of Japan?"
responses = ["Aardvark", "Durian", "Tokyo"]

print("Spawning service...")
judge = await Judge.options(**cfg.services.policy).as_service(**cfg.policy)

print(f"Prompt: {prompt}")
print(f"Responses: {responses}\n")
print("Evaluating responses...")
best_response_evaluations: list[str] = await judge.evaluate.route(
prompt=prompt, responses=responses, evaluation_mode=EvaluationMode.BEST_RESPONSE
)
response_check_evaluations: list[str] = await judge.evaluate.route(
prompt=prompt,
responses=responses,
evaluation_mode=EvaluationMode.RESPONSE_CHECK,
)
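    # MATH_CHECK is not exercised here because it requires a ground truth; an
    # illustrative call (hypothetical prompt and answers) would look like:
    # math_check_evaluations: list[str] = await judge.evaluate.route(
    #     prompt="What is 2 + 2?",
    #     responses=["4", "5"],
    #     ground_truth="4",
    #     evaluation_mode=EvaluationMode.MATH_CHECK,
    # )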

print("\nGeneration Results:")
print("=" * 80)
for batch, (best, fact) in enumerate(
zip(best_response_evaluations, response_check_evaluations)
):
print(f"Sample {batch + 1}")
print(f"Evaluation (BEST_RESPONSE): {best}")
print(f"Evaluation (RESPONSE_CHECK): {fact}")
print("-" * 80)

print("\nShutting down...")
await judge.shutdown()
await shutdown()


@parse
def recipe_main(cfg: DictConfig) -> None:
asyncio.run(run(cfg))


if __name__ == "__main__":
recipe_main()
214 changes: 214 additions & 0 deletions src/forge/actors/judge.py
@@ -0,0 +1,214 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from dataclasses import dataclass
from enum import auto, Enum

from monarch.actor import endpoint

from forge.actors.policy import Policy
from forge.data_models.completion import Completion


class EvaluationMode(Enum):
"""Enum for selecting how a judge should evaluate the provided args"""

BEST_RESPONSE = auto()
RESPONSE_CHECK = auto()
MATH_CHECK = auto()


@dataclass
class Judge(Policy):
"""
`LLM-based Judges` are typically generative models which are then prompted
to evaluate responses. These models NEED prompt engineering to evaluate
and may require more postprocessing
"""

def _math_check(
self,
prompt: str,
responses: list[str],
ground_truth: None | str = None,
) -> str:
"""
Construct the generator input. Formats the request such that the generator
will return a comma separated list with a [[GOOD]] or [[BAD]] evaluation
for each response, corresponding to whether the model thinks the response
matches the provided ground_truth. Specifically the generator is prompted to
check for mathematical equivalence

Note: This is not a "good" prompt, it just demonstrates how to make one
"""

        if ground_truth is None:
            raise ValueError("MATH_CHECK requires a ground_truth to compare against")

system_prompt = f"""
You are a math professor. Given the prompt and ground truth solution, evaluate
each of the provided attempts and return whether the final solution is
numerically equivalent to the ground truth.

Each response is formatted as [Response #<N>], where <N> represents the
attempt.

Your answer should be a comma separated list of "[[GOOD]]" or "[[BAD]]",
corresponding to the same order as the responses provided.

- If the answer is irrelevant to the prompt, return "[[BAD]]".
- If you are not confident that solution and attempt are equivalent, return "[[BAD]]"
- Only return "[[GOOD]]" if the attempt is numerically equivalent

Do not explain your reasoning, just provide your evaluations.
---
Here is the prompt that generated the responses: {prompt}.
---
Here is the ground truth: {ground_truth}
"""
response_str = "\n".join(
[f"[Response #{i+1}] {resp}" for i, resp in enumerate(responses)]
)
as_chat = [
{"role": "system", "content": system_prompt},
{"role": "user", "content": response_str},
]
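        # Render the chat messages into a single prompt string using the
        # tokenizer's chat template (tokenize=False returns text, not token ids)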
tokenizer = self.processor.tokenizer.tokenizer
formatted_request = tokenizer.apply_chat_template(
as_chat, tokenize=False, add_generation_prompt=True
)
return formatted_request

def _response_check(
self,
prompt: str,
responses: list[str],
ground_truth: None | str = None,
) -> str:
"""
Construct the generator input. Formats the request such that the generator
will return a comma separated list with a [[GOOD]] or [[BAD]] evaluation
for each response, corresponding to whether the model thinks it correct
answers the prompt.

Note: This is not a "good" prompt, it just demonstrates how to make one
"""

system_prompt = f"""
You are an expert fact checker. Given a prompt and response attempts, evaluate
each attempt and return whether it accurately answers the prompt.
Each response is formatted as [Response #<N>], where <N> represents the
attempt.

Your answer should be a comma separated list of "[[GOOD]]" or "[[BAD]]",
corresponding to the same order as the responses provided.

- If the answer is irrelevant to the prompt, return "[[BAD]]".
- If you are not confident that the answer accurately answers the prompt, return "[[BAD]]"
- Only return "[[GOOD]]" if the attempt accurately answers the prompt

Do not explain your reasoning, just provide your evaluations.
Here is the prompt that generated the responses: {prompt}.
"""
response_str = "\n".join(
[f"[Response #{i+1}] {resp}" for i, resp in enumerate(responses)]
)
as_chat = [
{"role": "system", "content": system_prompt},
{"role": "user", "content": response_str},
]
tokenizer = self.processor.tokenizer.tokenizer
formatted_request = tokenizer.apply_chat_template(
as_chat, tokenize=False, add_generation_prompt=True
)
return formatted_request

def _best_check(
self,
prompt: str,
responses: list[str],
ground_truth: None | str = None,
) -> str:
"""
Construct the generator input. Format the request such that the generator
will respond with a single integer corresponding to the response the model
thinks is most factually correct.

Note: This is not a "good" prompt, it just demonstrates how to make one
"""

system_prompt = f"""
You are an expert evaluator. Evaluate the responses provided and return
a single integer indicating which response is the most factually correct.
Each response is formatted as [Response #<N>], where <N> represents the
selection. Do not explain your reasoning, just provide a number.

Here is the prompt that generated the responses: {prompt}.
"""
response_str = "\n".join(
[f"[Response #{i+1}] {resp}" for i, resp in enumerate(responses)]
)
as_chat = [
{"role": "system", "content": system_prompt},
{"role": "user", "content": response_str},
]
tokenizer = self.processor.tokenizer.tokenizer
formatted_request = tokenizer.apply_chat_template(
as_chat, tokenize=False, add_generation_prompt=True
)
return formatted_request

def _postprocess_output(self, outputs: list[Completion]) -> list[str]:
return [output.text for output in outputs]

@endpoint
async def evaluate(
self,
prompt: str,
responses: None | list[str] = None,
ground_truth: None | str = None,
evaluation_mode: EvaluationMode = EvaluationMode.BEST_RESPONSE,
) -> list[str]:
_prompting: dict = {
EvaluationMode.BEST_RESPONSE: self._best_check,
EvaluationMode.RESPONSE_CHECK: self._response_check,
EvaluationMode.MATH_CHECK: self._math_check,
}

wrapped_prompt: str = _prompting[evaluation_mode](
prompt, responses, ground_truth
)
        response: list[Completion] = await self.generate._method(self, wrapped_prompt)
return self._postprocess_output(response)
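
# Illustrative sketch, not part of the Judge API above: RESPONSE_CHECK and
# MATH_CHECK prompt the generator for a comma-separated list of "[[GOOD]]" /
# "[[BAD]]" verdicts, so callers typically need a small parsing step on top of
# the raw strings returned by `evaluate`. A minimal parser, assuming one
# verdict per response, could look like:
def parse_verdicts(evaluation: str) -> list[bool]:
    """Map a '[[GOOD]], [[BAD]], ...' string to per-response booleans."""
    return ["[[GOOD]]" in token for token in evaluation.split(",")]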


@dataclass
class RewardModelJudge(Policy):
"""
`RewardModels` are typically discriminative models, post trained to
evaluate responses without further prompting required.
"""

    # TODO: Add reward model formatting
    def _wrap_prompt(
self, prompt: str, responses: list[str], ground_truth: None | str = None
) -> str:
return prompt

def _postprocess_output(
self, outputs: list[Completion], ground_truth: None | str = None
) -> list[str]:
return [output.text for output in outputs]

@endpoint
async def evaluate(
self,
prompt: str,
responses: list[str],
) -> list[str]:
wrapped_prompt: str = self._wrap_prompt(prompt, responses)
        response: list[Completion] = await self.generate._method(self, wrapped_prompt)
return self._postprocess_output(response)