diff --git a/docs/dev/intrinsics_and_adapters.md b/docs/dev/intrinsics_and_adapters.md
new file mode 100644
index 00000000..a41c3e5b
--- /dev/null
+++ b/docs/dev/intrinsics_and_adapters.md
@@ -0,0 +1,38 @@
+# Intrinsics and Adapters
+Note: Mellea currently only supports GraniteCommonAdapters and Intrinsics.
+
+## Basics
+In Mellea, an intrinsic is a type of Component that signals one or more of the following to a backend:
+- a special adapter must be used for generation
+- the input/output for generation must be transformed in a particular way
+- the model options must be modified in a particular way
+
+These changes apply only when the intrinsic is the "action" of the request. Intrinsics should usually not be used as an item in the context of generation (in fact, by default, Intrinsics have no string representation).
+
+These changes are specified by the Adapter that corresponds to a given Intrinsic. Matching is based on the adapter name and type.
+
+## Parts of an Intrinsic
+Intrinsics specify:
+- an adapter name (e.g. `requirement_check`)
+- the types of adapters that may be used (e.g. alora)
+- any kwargs necessary (e.g. a requirement like "make sure the last user message is...")
+
+## Parts of an Adapter
+Adapters specify:
+- compatible backends
+- adapter type
+- functions for getting a path to load them
+
+## Using Intrinsics
+Mellea Intrinsics currently use the [granite-common](https://github.com/ibm-granite/granite-common) package for loading adapters and formatting inputs/outputs. This means Mellea only allows intrinsics/adapters that follow this pattern.
+
+## Needed Future Work
+### Custom Adapters / Intrinsics
+Mellea should support custom intrinsic / adapter implementations. To do this:
+- make the backend `_generate_from_intrinsic` functions generic and rely only on common adapter functions
+- adapters must specify a transformation function that encapsulates the input/output modifications necessary for their generation requests
+
+### Concurrency Checks
+Some backends that allow adapters to be loaded (currently only LocalHFBackend) cannot use these adapters independently without impacting other generation requests.
+
+These backends should support a generation lock that ensures requests are only performed when the correct set of adapters (or no adapters) is active.
diff --git a/docs/dev/requirement_aLoRA_rerouting.md b/docs/dev/requirement_aLoRA_rerouting.md
index f7001df0..d21fb777 100644
--- a/docs/dev/requirement_aLoRA_rerouting.md
+++ b/docs/dev/requirement_aLoRA_rerouting.md
@@ -14,14 +14,14 @@ The actual rule is slightly more complicated.
 
 ## The Actual Rule
 
-If a `Requirement` is validated using a backend that could either use a `constraint` aLoRA or perform an LLMaJ prompt on the underlying model, then the aLoRA is used for validation, even if the `backend.generate_from_context` method is called instead of the `alora.generate_from_strings` method.
+If a `Requirement` is validated using a backend that could either use a `requirement_check` aLoRA or perform an LLMaJ prompt on the underlying model, then the aLoRA is used for validation, even if the `backend.generate_from_context` method is called instead of the `backend._generate_from_intrinsic` method.
 
 There are three exceptions to this rule:
 
 1. `Backend.default_to_constraint_checking_alora` is set to `False` (this parameter defaults to `True`).
 2. The `Requirement` has a more specific subtype that indicates a more specific intent (`LLMaJRequirement`).
 3. The `ALoRA` requirement checker throws an exception.
 
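To make the rule concrete, here is a minimal sketch of the three requirement flavors involved (assuming each constructor takes the requirement description as its first argument; exact signatures may differ):

```python
from mellea.stdlib.requirement import ALoraRequirement, LLMaJRequirement, Requirement

# Default rule: a plain Requirement is rerouted to the requirement_check aLoRA when one is available.
plain = Requirement("The response must mention both input strings.")

# Exception 2: an LLMaJRequirement always uses an LLM-as-a-judge prompt on the underlying model.
llmaj_only = LLMaJRequirement("The response must mention both input strings.")

# Disambiguation of exception 1: an ALoraRequirement is always routed to the aLoRA,
# regardless of `default_to_constraint_checking_alora`.
alora_only = ALoraRequirement("The response must mention both input strings.")
```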
-There is an exception (or disambiguation) to the first exception: If the user provides an `ALoRARequirement`, then the `backend.generate_from_context` call is rerouted to the constraint checking LoRA, regardless of the value of `deault_to_constraint_checking_alora`.
+There is an exception (or disambiguation) to the first exception: If the user provides an `ALoRARequirement`, then the `backend.generate_from_context` call is rerouted to the constraint checking LoRA, regardless of the value of `default_to_constraint_checking_alora`.
 
 ## Decision Rationale
 
@@ -33,12 +33,14 @@ Suppose that the user creates a backend and then adds a generic constraint check
 
 ```python
 from mellea import start_session
-from mellea.backends.aloras.granite_aloras import add_granite_aloras
+from mellea.backends.adapters.adapter import GraniteCommonAdapter
 from mellea.stdlib.requirement import Requirement
 
 m = start_session(
     "huggingface.LocalHFBackend:ibm-granite/granite-3.2-8b-instruct")
-add_granite_aloras(m) # This will load the Constraint checint aLoRA.
+
+# By default, the AloraRequirement uses a GraniteCommonAdapter with "requirement_check".
+m.backend.add_adapter(GraniteCommonAdapter("requirement_check"))
 
 m.instruct(
     "Corporate wants you to find the difference between these two strings:\n\naaa\naba")
diff --git a/docs/examples/intrinsics/intrinsics.py b/docs/examples/intrinsics/intrinsics.py
new file mode 100644
index 00000000..98fd46af
--- /dev/null
+++ b/docs/examples/intrinsics/intrinsics.py
@@ -0,0 +1,45 @@
+from mellea.backends.openai import OpenAIBackend, _ServerType
+from mellea.backends.adapters.adapter import AdapterType, GraniteCommonAdapter
+from mellea.stdlib.base import ChatContext, ModelOutputThunk
+from mellea.stdlib.chat import Message
+import mellea.stdlib.funcs as mfuncs
+from mellea.stdlib.intrinsics.intrinsic import Intrinsic
+
+# Create the adapter. GraniteCommonAdapters default to the ALORA adapter type.
+req_adapter = GraniteCommonAdapter("requirement_check")
+
+# Create the backend. Assumes a locally running vLLM server.
+backend = OpenAIBackend(
+    model_id="ibm-granite/granite-3.3-8b-instruct",
+    base_url="http://0.0.0.0:8000/v1",
+    api_key="EMPTY",
+)
+
+# If using a remote vLLM server, use the `test/backends/test_openai_vllm/serve.sh`
+# script with `export VLLM_DOWNLOAD_RAG_INTRINSICS=True`. This will download the granite_common
+# adapters on the server.
+backend._server_type = _ServerType.REMOTE_VLLM
+
+# Add the adapter to the backend.
+backend.add_adapter(req_adapter)
+
+ctx = ChatContext()
+ctx = ctx.add(Message("user", "Hi, can you help me?"))
+ctx = ctx.add(Message("assistant", "Hello; yes! What can I help with?"))
+
+# Generate from an intrinsic with the same name as the adapter. By default, it will look for
+# ALORA and then LORA adapters.
+out, new_ctx = mfuncs.act(
+    Intrinsic(
+        "requirement_check",
+        intrinsic_kwargs={"requirement": "The assistant is helpful."},
+    ),
+    ctx,
+    backend,
+)
+
+# Print the output. The requirement_check adapter has a specific output format:
+print(out)  # {"requirement_likelihood": 1.0}
+
+# The AloraRequirement uses this same adapter and automatically parses this output
+# when validating.
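As a follow-on sketch to the example above (continuing from the `mfuncs`, `ctx`, and `backend` defined there), an `ALoraRequirement` can be driven through the same adapter. The call below assumes that `ALoraRequirement`'s intrinsic name defaults to `"requirement_check"` and that `mfuncs.act` accepts a requirement as its action; treat it as illustrative rather than part of the example file:

```python
from mellea.stdlib.requirement import ALoraRequirement

# Routed through the requirement_check adapter added above. When used for requirement
# validation, the adapter's JSON output (e.g. {"requirement_likelihood": ...}) is parsed automatically.
req = ALoraRequirement("The assistant is helpful.")
out, new_ctx = mfuncs.act(req, ctx, backend)
print(out)
```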
diff --git a/mellea/backends/_utils.py b/mellea/backends/_utils.py
index c6e90ba8..08720bc0 100644
--- a/mellea/backends/_utils.py
+++ b/mellea/backends/_utils.py
@@ -4,7 +4,6 @@
 from collections.abc import Callable
 from typing import Any, Literal
 
-from mellea.backends.aloras import Alora
 from mellea.backends.formatter import Formatter
 from mellea.backends.tools import parse_tools
 from mellea.helpers.fancy_logger import FancyLogger
@@ -57,30 +56,6 @@ def to_chat(
     return ctx_as_conversation
 
 
-def use_alora(
-    action: Component | CBlock,
-    alora: Alora | None,
-    default_to_constraint_checking_alora: bool,
-) -> bool:
-    """Returns True when the condition for using alora is met.
-
-    See `docs/dev/requirement_aLoRA_rerouting.md` for an explanation of the following code block.
-    """
-    if issubclass(type(action), Requirement):
-        # The general rule is that we reroute to the alora if it exists.
-        reroute_to_alora = alora is not None
-        # However, there are some exceptions:
-        if not default_to_constraint_checking_alora:
-            reroute_to_alora = False
-        if issubclass(type(action), LLMaJRequirement):
-            reroute_to_alora = False
-        if issubclass(type(action), ALoraRequirement):
-            reroute_to_alora = True
-        return reroute_to_alora
-    else:
-        return False
-
-
 def to_tool_calls(
     tools: dict[str, Callable], decoded_result: str
 ) -> dict[str, ModelToolCall] | None:
diff --git a/mellea/backends/adapters/adapter.py b/mellea/backends/adapters/adapter.py
new file mode 100644
index 00000000..7b723661
--- /dev/null
+++ b/mellea/backends/adapters/adapter.py
@@ -0,0 +1,223 @@
+"""Module for adapters to backends."""
+
+import abc
+import pathlib
+from enum import Enum
+from typing import Any, TypeVar, cast
+
+import granite_common
+
+from mellea.backends import Backend
+from mellea.backends.types import _ServerType
+
+
+class AdapterType(Enum):
+    """Possible types of adapters for a backend."""
+
+    LORA = "lora"
+    ALORA = "alora"
+
+
+class Adapter(abc.ABC):
+    """An adapter that can be added to a single backend."""
+
+    def __init__(self, name: str, adapter_type: AdapterType):
+        """An adapter that can be added to a backend.
+
+        Note: An adapter can only be added to a single backend.
+
+        Args:
+            name: name of the adapter; when referencing this adapter, use adapter.qualified_name
+            adapter_type: enum describing what type of adapter it is (ie LORA / ALORA)
+        """
+        self.name = name
+        self.adapter_type = adapter_type
+        self.qualified_name = name + "_" + adapter_type.value
+        """the name of the adapter to use when loading / looking it up"""
+
+        self.backend: Backend | None = None
+        """set when the adapter is added to a backend"""
+
+        self.path: str | None = None
+        """set when the adapter is added to a backend"""
+
+
+class OpenAIAdapter(Adapter):
+    """Adapter for OpenAIBackends."""
+
+    @abc.abstractmethod
+    def get_open_ai_path(
+        self,
+        base_model_name: str,
+        server_type: _ServerType = _ServerType.LOCALHOST,
+        remote_path: str | None = None,
+    ) -> str:
+        """Returns the path needed to load the adapter.
+
+        Args:
+            base_model_name: the base model; typically the last part of the huggingface model id like "granite-3.3-8b-instruct"
+            server_type: the server type (ie LOCALHOST / OPENAI); usually the backend has information on this
+            remote_path: optional; used only if the server_type is REMOTE_VLLM; base path at which to find the adapter
+        """
+        ...
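For illustration, a hypothetical `OpenAIAdapter` subclass might satisfy the abstract interface above as follows. Custom adapter support is listed as future work in `docs/dev/intrinsics_and_adapters.md`, so the class name and path handling here are made up:

```python
from mellea.backends.adapters.adapter import AdapterType, OpenAIAdapter
from mellea.backends.types import _ServerType


class LocalPathLoraAdapter(OpenAIAdapter):
    """Hypothetical LORA adapter whose weights live at a fixed local path."""

    def __init__(self, name: str, local_path: str):
        super().__init__(name, AdapterType.LORA)
        self._local_path = local_path

    def get_open_ai_path(
        self,
        base_model_name: str,
        server_type: _ServerType = _ServerType.LOCALHOST,
        remote_path: str | None = None,
    ) -> str:
        # Only a locally hosted server can load weights from this machine's filesystem.
        if server_type != _ServerType.LOCALHOST:
            raise ValueError(f"{self.name} only supports locally hosted servers")
        return self._local_path
```

In principle a backend would pick such an adapter up via `add_adapter` and match it to an `Intrinsic` by `qualified_name`, though the current `_generate_from_intrinsic` implementations only accept `GraniteCommonAdapter`s.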
+ + +class LocalHFAdapter(Adapter): + """Adapter for LocalHFBackends.""" + + @abc.abstractmethod + def get_local_hf_path(self, base_model_name: str) -> str: + """Returns the path needed to load the adapter. + + Args: + base_model_name: the base model; typically the last part of the huggingface model id like "granite-3.3-8b-instruct" + """ + ... + + +class GraniteCommonAdapter(OpenAIAdapter, LocalHFAdapter): + """Adapter for intrinsics that utilize the GraniteCommon library.""" + + def __init__( + self, + name: str, + adapter_type: AdapterType = AdapterType.ALORA, + config_file: str | pathlib.Path | None = None, + config_dict: dict | None = None, + base_model_name: str | None = None, + ): + """An adapter that can be added to either an `OpenAIBackend` or a `LocalHFBackend`. Most rag-lib-intrinsics support lora or alora adapter types. + + Args: + name: name of the adapter; when referencing this adapter, use adapter.qualified_name + adapter_type: enum describing what type of adapter it is (ie LORA / ALORA) + config_file: optional; file for defining the intrinsic / transformations + config_dict: optional; dict for defining the intrinsic / transformations + base_model_name: optional; if provided with no config_file/config_dict, will be used to lookup the granite_common config for this adapter + """ + assert adapter_type == AdapterType.ALORA or adapter_type == AdapterType.LORA, ( + f"{adapter_type} not supported" + ) + super().__init__(name, adapter_type) + + self.base_model_name = base_model_name + + # If any of the optional params are specified, attempt to set up the + # config for the intrinsic here. + config: dict | None = None + if config_file is not None or config_dict is not None: + config = granite_common.intrinsics.util.make_config_dict( + config_file=config_file, config_dict=config_dict + ) + config = cast( + dict, config + ) # Can remove if util function gets exported properly. + + if config is None and self.base_model_name is not None: + is_alora = True if self.adapter_type == AdapterType.ALORA else False + io_yaml_file = granite_common.intrinsics.util.obtain_io_yaml( + self.name, self.base_model_name, alora=is_alora + ) + config = granite_common.intrinsics.util.make_config_dict( + config_file=io_yaml_file + ) + config = cast( + dict, config + ) # Can remove if util function gets exported properly. + + self.config: dict | None = config + + def get_open_ai_path( + self, + base_model_name: str, + server_type: _ServerType = _ServerType.LOCALHOST, + remote_path: str | None = None, + ) -> str: + """Returns the path needed to load the adapter. + + Args: + base_model_name: the base model; typically the last part of the huggingface model id like "granite-3.3-8b-instruct" + server_type: the server type (ie LOCALHOST / OPENAI); usually the backend has information on this + remote_path: optional; used only if the server_type is REMOTE_VLLM; base path at which to find the adapter + """ + if server_type == _ServerType.LOCALHOST: + path = self.download_and_get_path(base_model_name) + elif server_type == _ServerType.REMOTE_VLLM: + if remote_path is None: + remote_path = "rag-intrinsics-lib" + path = self.get_path_on_remote(base_model_name, remote_path) + else: + raise ValueError( + f"{self} not supported for OpenAIBackend with server_type: {server_type}" + ) + + return path + + def get_local_hf_path(self, base_model_name: str) -> str: + """Returns the path needed to load the adapter. 
+ + Args: + base_model_name: the base model; typically the last part of the huggingface model id like "granite-3.3-8b-instruct" + """ + return self.download_and_get_path(base_model_name) + + def download_and_get_path(self, base_model_name: str) -> str: + """Downloads the required rag intrinsics files if necessary and returns the path to the them. + + Args: + base_model_name: the base model; typically the last part of the huggingface model id like "granite-3.3-8b-instruct" + + Returns: + a path to the files + """ + is_alora = self.adapter_type == AdapterType.ALORA + return str( + granite_common.intrinsics.util.obtain_lora( + self.name, base_model_name, alora=is_alora + ) + ) + + def get_path_on_remote(self, base_model_name: str, base_path: str) -> str: + """Assumes the files have already been downloaded on the remote server.""" + return f"./{base_path}/{self.name}/{self.adapter_type.value}/{base_model_name}" + + +T = TypeVar("T") + + +def get_adapter_for_intrinsic( + intrinsic_name: str, + intrinsic_adapter_types: list[AdapterType], + available_adapters: dict[str, T], +) -> T | None: + """Finds an adapter from a dict of available adapters based on the intrinsic name and its allowed adapter types. + + Args: + intrinsic_name: the name of the intrinsic, like "answerability" + intrinsic_adapter_types: the adapter types allowed for this intrinsic, like ALORA / LORA + available_adapters: the available adapters to choose from; maps adapter.qualified_name to the Adapter + + Returns: + an Adapter if found; else None + """ + adapter = None + for adapter_type in intrinsic_adapter_types: + qualified_name = intrinsic_name + "_" + adapter_type.value + adapter = available_adapters.get(qualified_name, None) + if adapter is not None: + break + + return adapter + + +class AdapterMixin(abc.ABC): + """Mixin class for backends capable of utilizing adapters.""" + + def add_adapter(self, *args, **kwargs): + """Adds the given adapter to the backend. Must not have been added to a different backend.""" + + def load_adapter(self, adapter_qualified_name: str): + """Loads the given adapter for the backend. Must have previously been added.""" + + def unload_adapter(self, adapter_qualified_name: str): + """Unloads the given adapter from the backend.""" diff --git a/mellea/backends/aloras/__init__.py b/mellea/backends/aloras/__init__.py deleted file mode 100644 index ae7b37b2..00000000 --- a/mellea/backends/aloras/__init__.py +++ /dev/null @@ -1,57 +0,0 @@ -"""Abstract interfaces for Backends that implement Activated LoRAs.""" - -import abc - -from mellea.stdlib.base import CBlock, ModelOutputThunk - - -class Alora(abc.ABC): - """Activated LoRAs (Aloras)](https://arxiv.org/pdf/2504.12397) are are [low-rank adapters](https://arxiv.org/abs/2106.09685) that can reuse KV cache from their underlying model. - - This class should not be directly subclassed by a specific ALora. Each backend that supports ALora should provide a backend-specific abstract class that subclasses `ALora`. Individual ALoras should then be defined by subclassing the model-specific backend. - - ALoras are always attached to an underlying model and use the following calling convention: - 1. The underlying model is prompted (without the Alora active). We call this the `input`. - 2. The underlying model generates some tokens from the `input` context (again, without the ALora active). We call this the `response`. - 3. Then the adapter is activated and generates some tokens. We call then the `alora_response`. 
- - Args: - name (str): An arbitrary name/label in the model serving engine (e.g. vllm, or local huggingface) to assign to an ALora. This is irrelevant from the alora's (huggingface) model id. - """ - - def __init__(self, name: str): - """Each aLoRA is identified by a name.""" - self.name: str = name - - @abc.abstractmethod - def generate_using_strings(self, *args, **kwargs) -> ModelOutputThunk: - """Generates from the ALora using raw strings as the interface for inputs. In most cases, must be run from a running event loop. - - This has a generic signature because each aLoRA has different parameters depending on its functionality and how it gets called. - """ - - def generate_using_stdlib(self, *args, **kwargs) -> CBlock: - """Generates from the Alora using Span-based backends.""" - # This is NOT marked as an `abc.abstractmethod` for now because we are not releasing span-based backends. When we release a span-based backend, we should mark this method as `abc.abstractmethod`""" - raise NotImplementedError( - "There are not currently ant ALoras trained to use spans." - ) - - -class AloraBackendMixin(abc.ABC): - """Mixin class for backends capable of aLoRA functionality.""" - - @abc.abstractmethod - def add_alora(self, *args, **kwargs): - """Loads an ALora.""" - ... - - @abc.abstractmethod - def get_alora(self, alora_name: str) -> Alora | None: - """Returns the ALora by name, or None of that ALora isn't loaded.""" - ... - - @abc.abstractmethod - def get_aloras(self) -> list[Alora]: - """Returns a list of all loaded aLoRA adapters.""" - ... diff --git a/mellea/backends/aloras/huggingface/__init__.py b/mellea/backends/aloras/huggingface/__init__.py deleted file mode 100644 index 6746ef50..00000000 --- a/mellea/backends/aloras/huggingface/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""ALora implementations for `mellea.backends.huggingface` backends.""" diff --git a/mellea/backends/aloras/huggingface/granite_aloras.py b/mellea/backends/aloras/huggingface/granite_aloras.py deleted file mode 100644 index b5e29a47..00000000 --- a/mellea/backends/aloras/huggingface/granite_aloras.py +++ /dev/null @@ -1,285 +0,0 @@ -"""Huggingface implementations for IBM's "starter pack" of Activated LoRAs.""" - -import asyncio -import functools -from copy import deepcopy - -import torch -from transformers.generation.utils import GenerateDecoderOnlyOutput - -from mellea.backends.huggingface import HFAlora, HFAloraCacheInfo, LocalHFBackend -from mellea.backends.types import ModelOption -from mellea.helpers.async_helpers import send_to_queue -from mellea.helpers.fancy_logger import FancyLogger -from mellea.stdlib.base import GenerateType, ModelOutputThunk - - -class HFConstraintAlora(HFAlora): - """The Requirement Checking ALora for Granite checks if the specified requirement was satisfied by the most recent model generation. Only one requirement is checked at a time. - - Currently supports [Granite 3.2 8B](https://huggingface.co/ibm-granite/granite-3.2-8b-alora-requirement-check) and [Granite 3.3 8B](https://huggingface.co/ibm-granite/granite-3.3-8b-alora-requirement-check) by default. - """ - - def __init__( - self, - name: str, - path_or_model_id: str, - generation_prompt: str, - backend: LocalHFBackend, - *, - constraint_prompt: str | None = None, - include_constraint_in_alora_offset: bool = False, - ): - """Initialize after checking that the backend is correct. - - Args: - name: name of the alora. - path_or_model_id: huggingface path or model id. - generation_prompt: the prompt required to activate the aLoRa. 
- backend: a LocalHFBackend that this alora is attached to. - constraint_prompt: a template that the constraint can be interpolated into; can only have a single `{}` slot. - include_constraint_in_alora_offset: whether to include the constraint prompt in the alora offset. - """ - super().__init__(name, path_or_model_id, generation_prompt, backend) - - # Maintain default behavior. - if constraint_prompt is None: - constraint_prompt = "\nRequirement: {}<|end_of_text|>\n" - - self._constraint_prompt = constraint_prompt - self._include_constraint_in_alora_offset = include_constraint_in_alora_offset - - # We do a lot of logging for ALoras because this is an experimental feature. Maybe we should tag these log messages? - self._logger = FancyLogger.get_logger() - - def generate_using_strings( - self, - input: str, - response: str, - constraint: str, - force_yn: bool = True, - stream: bool = False, - ) -> ModelOutputThunk: - """Generates a constraint response from the ALora. Must be run in a running event loop.""" - assert self._backend.alora_model is not None - # Go ahead and do runtime type-checking because passing CBlocks into this function is a common error. - assert type(input) is str - assert type(response) is str - assert type(constraint) is str - self._backend.alora_model.set_adapter(self.name) - cache_hit = self._backend.cache_get(response) - - if stream: - self._logger.warning( - "`HFConstraintAlora` cannot stream output; defaulting to non-streaming approach." - ) - - generate_kwargs = {} - if cache_hit: - self._logger.debug( - f"using cache for alora {self.__class__} and response '{response}'" - ) - generate_kwargs["past_key_values"] = deepcopy(cache_hit.kv_cache) - input_combined = self._generate_using_cache(cache_hit, constraint, force_yn) - - else: - self._logger.debug( - f"not using cache for alora {self.__class__} and response '{response}'" - ) - input_combined = self._generate_not_using_cache( - input, response, constraint, force_yn - ) - - if not self._include_constraint_in_alora_offset: - alora_offsets = [self._generation_prompt_tokens["input_ids"].shape[1] - 1] - else: - # Get the constraint tokens separately so that we can calculate the alora offsets. - constraint_tokens = self._backend._tokenizer( - self._constraint_prompt.format(constraint), return_tensors="pt" - ).to(self._backend._device) - - alora_offsets = [ - constraint_tokens["input_ids"].shape[1] - + self._generation_prompt_tokens["input_ids"].shape[1] - - 2 - ] - - chat_response = asyncio.to_thread( - self._backend.alora_model.generate, - input_combined["input_ids"].to(self._backend._device), - attention_mask=input_combined["attention_mask"].to(self._backend._device), - max_new_tokens=1, - return_dict_in_generate=True, - alora_offsets=alora_offsets, - output_scores=True, - **generate_kwargs, - ) - - output = ModelOutputThunk(None) - output._meta["alora_name"] = self.name - - output._process = functools.partial( - processing, - backend=self._backend, - force_yn=force_yn, - gen_prompt=self._generation_prompt, - ) - output._post_process = functools.partial(post_processing, backend=self._backend) - - try: - # To support lazy computation, will need to remove this create_task and store just the unexecuted coroutine. - # We can also support synchronous calls by adding a flag and changing this ._generate function. - - # This function should always be called from a running event loop so we don't have to worry about - # scheduling the task to a specific event loop here. 
- output._generate = asyncio.create_task( - send_to_queue(chat_response, output._async_queue) # type: ignore - ) - output._generate_type = GenerateType.ASYNC - except RuntimeError as e: - # Most likely cause is running this function without an event loop present. - raise e - - return output - - def _generate_using_cache( - self, cache_hit: HFAloraCacheInfo, constraint: str, force_yn: bool - ) -> dict: - """Returns the input object used for generation.""" - # Must tokenize the constraint here since the requirement isn't known at initialization. - constraint_tokens = self._backend._tokenizer( - self._constraint_prompt.format(constraint), return_tensors="pt" - ).to(self._backend._device) - - input_combined = { - "input_ids": torch.cat( - [ - cache_hit.merged_token_ids.unsqueeze(0), - constraint_tokens["input_ids"], - self._generation_prompt_tokens["input_ids"], - ], - dim=1, - ), - "attention_mask": torch.cat( - [ - cache_hit.merged_attention.unsqueeze(0), - constraint_tokens["attention_mask"], - self._generation_prompt_tokens["attention_mask"], - ], - dim=1, - ), - } - - self._logger.debug( - f"Prompt for cached aLoRA({self.name}):\n {self._backend._tokenizer.decode(input_combined['input_ids'][0])}" - ) - - return input_combined - - def _generate_not_using_cache( - self, input: str, response: str, constraint: str, force_yn: bool - ) -> dict: - """Returns the input object used for generation.""" - # Params aren't needed when just getting the backend args. - backend_model_opts = self._backend._simplify_and_merge(None) - sys_prompt = backend_model_opts.get(ModelOption.SYSTEM_PROMPT, None) - - chat = [ - *([{"role": "system", "content": sys_prompt}] if sys_prompt else []), - {"role": "user", "content": input}, - {"role": "assistant", "content": response}, - ] - - templatized = self._backend._tokenizer.apply_chat_template(chat, tokenize=False) - assert type(templatized) is str - - # Must tokenize the constraint here since the requirement isn't known at initialization. - templatized = templatized + self._constraint_prompt.format(constraint) - - tokenized = self._backend._tokenizer(templatized, return_tensors="pt").to( - self._backend._device - ) - - input_combined = { - "input_ids": torch.cat( - [tokenized["input_ids"], self._generation_prompt_tokens["input_ids"]], - dim=1, - ), - "attention_mask": torch.cat( - [ - tokenized["attention_mask"], - self._generation_prompt_tokens["attention_mask"], - ], - dim=1, - ), - } - - self._logger.debug( - f"Prompt for non-cached aLoRA({self.name}):\n{self._backend._tokenizer.decode(input_combined['input_ids'][0])}" - ) - - return input_combined - - -async def processing( - mot: ModelOutputThunk, - chunk: GenerateDecoderOnlyOutput, - backend: LocalHFBackend, - force_yn: bool, - gen_prompt: str, -): - """Called to process the incoming chunks.""" - if mot._underlying_value is None: - mot._underlying_value = "" - - # Don't support async for HFConstraintAlora. Means we can process the output here. 
- assert isinstance(chunk, GenerateDecoderOnlyOutput) - - if force_yn: - last_logits = chunk.scores[-1].squeeze(0) # type: ignore - token_Y = backend._tokenizer("Y", add_special_tokens=False)["input_ids"][0] # type: ignore - token_N = backend._tokenizer("N", add_special_tokens=False)["input_ids"][0] # type: ignore - logit_Y = last_logits[token_Y].item() - logit_N = last_logits[token_N].item() - mot._underlying_value = "Y" if logit_Y > logit_N else "N" - else: - output_text = backend._tokenizer.decode(chunk.sequences[0]) - constraint_satisfied = output_text.split(gen_prompt)[-1] - mot._underlying_value = constraint_satisfied[ - 0 - ] # Grab the first char of the str. - - -async def post_processing(mot: ModelOutputThunk, backend: LocalHFBackend): - """Called after all data has been received.""" - backend.formatter.parse(mot._action, mot) # type: ignore - - -def add_granite_aloras(backend: LocalHFBackend): - """Adds the IBM Granite "starter pack" ALoras to a backend.""" - if backend._hf_model_id == "ibm-granite/granite-3.2-8b-instruct": - backend.add_alora( - HFConstraintAlora( - name="constraint", - path_or_model_id="ibm-granite/granite-3.2-8b-alora-requirement-check", - generation_prompt="<|start_of_role|>check_requirement<|end_of_role|>", - backend=backend, - constraint_prompt="\nRequirement: {}<|end_of_text|>\n", - include_constraint_in_alora_offset=False, - ) - ) - elif backend._hf_model_id == "ibm-granite/granite-3.3-8b-instruct": - backend.add_alora( - HFConstraintAlora( - name="constraint", - path_or_model_id="ibm-granite/granite-3.3-8b-alora-requirement-check", - generation_prompt="<|start_of_role|>check_requirement<|end_of_role|>", - backend=backend, - constraint_prompt="\n<|start_of_role|>requirement<|end_of_role|>{}<|end_of_text|>\n", - include_constraint_in_alora_offset=True, - ) - ) - else: - raise ValueError( - f"cannot add_granite_aloras to unknown huggingface model_id / backend: {backend._hf_model_id}" - ) diff --git a/mellea/backends/aloras/openai/__init__.py b/mellea/backends/aloras/openai/__init__.py deleted file mode 100644 index f07c7f75..00000000 --- a/mellea/backends/aloras/openai/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""ALora implementations for `mellea.backends.openai` backends.""" diff --git a/mellea/backends/aloras/openai/granite_aloras.py b/mellea/backends/aloras/openai/granite_aloras.py deleted file mode 100644 index 6d1b2c6b..00000000 --- a/mellea/backends/aloras/openai/granite_aloras.py +++ /dev/null @@ -1,128 +0,0 @@ -"""OpenAI implementations for IBM's "starter pack" of Activated LoRAs.""" - -import asyncio -import functools -from collections.abc import Coroutine -from typing import Any - -import openai -from openai.types.completion import Completion - -from mellea.backends.aloras import Alora -from mellea.backends.openai import OpenAIAlora, OpenAIBackend -from mellea.backends.types import ModelOption -from mellea.helpers.async_helpers import send_to_queue -from mellea.helpers.fancy_logger import FancyLogger -from mellea.stdlib.base import GenerateType, ModelOutputThunk - - -class OpenAIConstraintAlora(OpenAIAlora): - """The [Requirement Checking ALora for Granite 3.2 8B](https://huggingface.co/ibm-granite/granite-3.2-8b-alora-requirement-check) checks if the specified requirement was satisfied by the most recent model generation. 
Only one requirement is checked at a time.""" - - def __init__( - self, name: str, path: str, generation_prompt: str, backend: OpenAIBackend - ): - """Initialize after checking that the backend is correct.""" - assert backend._hf_model_id == "ibm-granite/granite-3.2-8b-instruct" - super().__init__(name, path, generation_prompt, backend) - # We do a lot of logging for ALoras because this is an experimental feature. Maybe we should tag these log messages? - self._logger = FancyLogger.get_logger() - - def generate_using_strings( - self, - input: str, - response: str, - constraint: str, - force_yn: bool = True, - stream: bool = False, - ) -> ModelOutputThunk: - """Generates a constraint response from the ALora. Must be run in a running event loop.""" - # Go ahead and do runtime type-checking because passing CBlocks into this function is a common error. - assert type(input) is str - assert type(response) is str - assert type(constraint) is str - - # Params aren't needed when just getting the backend args. - backend_model_opts = self._backend._simplify_and_merge(None, False) - sys_prompt = backend_model_opts.get(ModelOption.SYSTEM_PROMPT, None) - - chat = [ - *([{"role": "system", "content": sys_prompt}] if sys_prompt else []), - {"role": "user", "content": input}, - {"role": "assistant", "content": response}, - ] - - prompt = self._backend.apply_chat_template(chat) - prompt += f"\nRequirement: {constraint}<|end_of_text|>\n" # type: ignore - prompt += self._generation_prompt - - self._logger.debug(f"Prompt for non-cached aLoRA({self.name}):\n{prompt}") - - force_yn_args = {} - if force_yn: - assert hasattr(self._backend, "_tokenizer") - token_Y = self._backend._tokenizer("Y", add_special_tokens=False)[ - "input_ids" - ][0] # type: ignore - token_N = self._backend._tokenizer("N", add_special_tokens=False)[ - "input_ids" - ][0] # type: ignore - - force_yn_args["logit_bias"] = {str(token_Y): 100, str(token_N): 100} - - chat_response: Coroutine[ - Any, Any, openai.AsyncStream[Completion] | Completion - ] = self._backend._async_client.completions.create( - model=self.name, - prompt=prompt, - max_tokens=1, - n=1, - stream=stream, - **force_yn_args, - ) # type: ignore - - output = ModelOutputThunk(None) - output._meta["alora_name"] = self.name - - output._process = processing - output._post_process = functools.partial(post_processing, self._backend) - - try: - # To support lazy computation, will need to remove this create_task and store just the unexecuted coroutine. - # We can also support synchronous calls by adding a flag and changing this ._generate function. - - # This function should always be called from a running event loop so we don't have to worry about - # scheduling the task to a specific event loop here. 
- output._generate = asyncio.create_task( - send_to_queue(chat_response, output._async_queue) - ) - output._generate_type = GenerateType.ASYNC - except RuntimeError as e: - # Most likely cause is running this function without an event loop present - raise e - - return output - - -async def processing(mot: ModelOutputThunk, chunk: Completion): - """Called to process the incoming chunks.""" - if mot._underlying_value is None: - mot._underlying_value = "" - mot._underlying_value += chunk.choices[0].text - - -async def post_processing(backend: OpenAIBackend, mot: ModelOutputThunk): - """Called after all data has been received.""" - backend.formatter.parse(mot._action, mot) # type: ignore - - -def add_granite_aloras(backend: OpenAIBackend): - """Adds the IBM Granite "starter pack" ALoras to a backend.""" - backend.add_alora( - OpenAIConstraintAlora( - name="constraint", - path="ibm-granite/granite-3.2-8b-alora-requirement-check", - generation_prompt="<|start_of_role|>check_requirement<|end_of_role|>", - backend=backend, - ) - ) diff --git a/mellea/backends/huggingface.py b/mellea/backends/huggingface.py index c5d9b0db..d90d941f 100644 --- a/mellea/backends/huggingface.py +++ b/mellea/backends/huggingface.py @@ -13,10 +13,13 @@ import inspect import json from collections.abc import Callable, Coroutine -from typing import TYPE_CHECKING, Any +from copy import deepcopy +from typing import TYPE_CHECKING, Any, cast +import granite_common import outlines import outlines_core +import peft import torch from transformers import ( AsyncTextIteratorStreamer, @@ -30,11 +33,18 @@ from transformers.generation.utils import GenerateDecoderOnlyOutput from mellea.backends import BaseModelSubclass -from mellea.backends._utils import to_chat, to_tool_calls, use_alora -from mellea.backends.aloras import Alora, AloraBackendMixin +from mellea.backends._utils import to_chat, to_tool_calls +from mellea.backends.adapters.adapter import ( + AdapterMixin, + AdapterType, + GraniteCommonAdapter, + LocalHFAdapter, + get_adapter_for_intrinsic, +) from mellea.backends.cache import Cache, SimpleLRUCache from mellea.backends.formatter import Formatter, FormatterBackend, TemplateFormatter from mellea.backends.model_ids import ModelIdentifier +from mellea.backends.openai import OpenAIBackend from mellea.backends.process_reward_models import PRM from mellea.backends.tools import ( add_tools_from_context_actions, @@ -54,11 +64,9 @@ ModelToolCall, ) from mellea.stdlib.chat import Message +from mellea.stdlib.intrinsics.intrinsic import Intrinsic from mellea.stdlib.requirement import ALoraRequirement, LLMaJRequirement, Requirement -if TYPE_CHECKING: - from alora.peft_model_alora import aLoRAPeftModelForCausalLM # type: ignore - assert outlines, "outlines needs to be present to make outlines_core work" """A configuration type for the unhappy path: Tokenizer * Model * torch device string @@ -80,7 +88,7 @@ class HFAloraCacheInfo: q_end: int = -1 -class LocalHFBackend(FormatterBackend, AloraBackendMixin): +class LocalHFBackend(FormatterBackend, AdapterMixin): """The LocalHFBackend uses Huggingface's transformers library for inference, and uses a Formatter to convert `Component`s into prompts. This backend also supports Activated LoRAs (ALoras)](https://arxiv.org/pdf/2504.12397). This backend is designed for running an HF model for small-scale inference locally on your machine. @@ -148,42 +156,22 @@ def __init__( self._hf_model_id = model_id.hf_model_name match custom_config: case None: - # Choose a device. 
- self._device = torch.device( - "cuda" - if torch.cuda.is_available() - else "mps" - if torch.backends.mps.is_available() - else "cpu" - ) # Get the model and tokenizer. self._model: PreTrainedModel = AutoModelForCausalLM.from_pretrained( self._hf_model_id - ).to(self._device) # type: ignore + ) self._tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained( self._hf_model_id ) case _: - self._tokenizer, self._model, self._device = custom_config + self._tokenizer, self._model, _ = custom_config self._use_caches = use_caches self._cache = cache if cache is not None else SimpleLRUCache(3) - # Used when running aLoRAs with this backend. - self._alora_model: "aLoRAPeftModelForCausalLM | None" = None # noqa: UP037 - # ALoras that have been loaded for this model. - self._aloras: dict[str, HFAlora] = {} - - @property - def alora_model(self) -> "aLoRAPeftModelForCausalLM | None": # noqa: UP037 - """The ALora model.""" - return self._alora_model - - @alora_model.setter - def alora_model(self, model: "aLoRAPeftModelForCausalLM | None"): # noqa: UP037 - """Sets the ALora model. This should only happen once in a backend's lifetime.""" - assert self._alora_model is None - self._alora_model = model + # Adapters can be made know to the backend (added) and loaded. + self._added_adapters: dict[str, LocalHFAdapter] = {} + self._loaded_adapters: dict[str, LocalHFAdapter] = {} def generate_from_context( self, @@ -198,71 +186,212 @@ def generate_from_context( # Upsert model options. model_opts = self._simplify_and_merge(model_options) - if use_alora( - action, - self.get_alora("constraint"), - self.default_to_constraint_checking_alora, - ): - mot = self._generate_from_context_alora( - action, ctx, _format=format, model_options=model_opts - ) - return mot, ctx.add(mot) - else: - mot = self._generate_from_context_standard( - action, - ctx, - _format=format, - model_options=model_opts, - tool_calls=tool_calls, + # Requirements can be automatically rerouted to a requirement adapter. + if isinstance(action, Requirement): + # See docs/dev/requirement_aLoRA_rerouting.md + reroute_to_alora = self.default_to_constraint_checking_alora + adapter_name = "requirement_check" + + if isinstance(action, ALoraRequirement): + reroute_to_alora = True + adapter_name = action.intrinsic_name + alora_action = action + else: + assert action.description is not None, ( + "must have a description when generating from a requirement" + ) + alora_action = ALoraRequirement(action.description, adapter_name) + + # Check if a requirement_check (or AloraRequirement specified) adapter exists. + alora_req_adapter = get_adapter_for_intrinsic( + adapter_name, [AdapterType.ALORA], self._added_adapters ) + if alora_req_adapter is None: + # Log a warning if using an AloraRequirement but no adapter fit. + if reroute_to_alora: + FancyLogger.get_logger().warning( + f"attempted to use an AloraRequirement but backend {self} doesn't have the specified adapter added {adapter_name}; defaulting to regular generation" + ) + + if issubclass(type(action), LLMaJRequirement): + reroute_to_alora = False + + if reroute_to_alora: + # Keep the alora requirement handling separate for now. 
+ mot = self._generate_from_intrinsic( + alora_action, ctx, model_options=model_opts + ) + return mot, ctx.add(alora_action).add(mot) + + elif isinstance(action, Intrinsic): + mot = self._generate_from_intrinsic(action, ctx, model_options=model_opts) return mot, ctx.add(action).add(mot) - def _generate_from_context_alora( - self, - action: Component | CBlock, - ctx: Context, - *, - _format: type[BaseModelSubclass] | None = None, - model_options: dict[str, Any], + mot = self._generate_from_context_standard( + action, ctx, _format=format, model_options=model_opts, tool_calls=tool_calls + ) + return mot, ctx.add(action).add(mot) + + def _generate_from_intrinsic( + self, action: Intrinsic, ctx: Context, *, model_options: dict[str, Any] ) -> ModelOutputThunk: - match action: - case ALoraRequirement(): - alora_for_this_request = ( - self.get_alora("constraint") - if action.alora is None - else action.alora - ) - case _: - alora_for_this_request = self.get_alora("constraint") - assert alora_for_this_request is not None, ( - "This code block should not execute unless there is a 'constraint' alora loaded." - ) - # Construct the linearized context. This is very similar to normal generation. + if not ctx.is_chat_context: + raise Exception("Does not yet support non-chat contexts.") + linearized_ctx = ctx.view_for_generation() - assert linearized_ctx is not None and len(linearized_ctx) > 1 - msgs = self.formatter.to_chat_messages(linearized_ctx) - user_message, assistant_message = msgs[-2].content, msgs[-1].content - assert alora_for_this_request is not None - assert type(user_message) is str - assert type(assistant_message) is str - assert _format is None, "Structured outputs are not supported by ALoRAs." - - alora_output = alora_for_this_request.generate_using_strings( - input=user_message, - response=assistant_message, - constraint=action.description, # type: ignore - stream=model_options.get(ModelOption.STREAM, False), + assert linearized_ctx is not None, ( + "If ctx.is_chat_context, then the context should be linearizable." ) + ctx_as_message_list: list[Message] = self.formatter.to_chat_messages( + linearized_ctx + ) + + conversation: list[dict] = [] + system_prompt = model_options.get(ModelOption.SYSTEM_PROMPT, "") + if system_prompt != "": + conversation.append({"role": "system", "content": system_prompt}) + + conversation.extend( + [OpenAIBackend.message_to_openai_message(m) for m in ctx_as_message_list] + ) + + docs = OpenAIBackend.messages_to_docs(ctx_as_message_list) + + seed = model_options.get(ModelOption.SEED, None) + if seed is not None: + set_seed(seed) + + if model_options.get(ModelOption.STREAM, None) is not None: + # Intrinsics don't support streaming because of their post-processing step. + FancyLogger.get_logger().warning( + "intrinsics cannot use streaming; removing model option" + ) + del model_options[ModelOption.STREAM] - # The alora function doesn't set up all the fields. - alora_output._context = linearized_ctx - alora_output._action = action - alora_output._model_options = model_options + adapter = get_adapter_for_intrinsic( + action.intrinsic_name, action.adapter_types, self._added_adapters + ) + if adapter is None: + raise ValueError( + f"backend ({self}) has no adapter for processing intrinsic: {action.intrinsic_name}" + ) - # TODO: Figure out what info we want to populate for aloras here. - alora_output._generate_log = GenerateLog() + # TODO: Code below this point is mostly specific to RagIntrinsics (and granite_common). 
+ # It should be refactored into a specific adapter.transform() function. + assert isinstance(adapter, GraniteCommonAdapter), ( + "currently Mellea only supports GraniteCommonAdapters and Intrinsics" + ) - return alora_output + intrinsic_config = adapter.config + if intrinsic_config is None: + # If the adapter wasn't initialized with a config, grab one here based off the backend's model. + intrinsic_config_file = granite_common.intrinsics.util.obtain_io_yaml( + action.intrinsic_name, self._hf_model_id.split("/")[-1] + ) + intrinsic_config = granite_common.intrinsics.util.make_config_dict( + config_file=intrinsic_config_file + ) + intrinsic_config = cast( + dict, intrinsic_config + ) # TODO: Can remove if util function gets exported properly. + + rewriter = granite_common.IntrinsicsRewriter( + config_dict=intrinsic_config, model_name=adapter.name + ) + result_processor = granite_common.IntrinsicsResultProcessor( + config_dict=intrinsic_config + ) + + # Convert our conversation into a proper chat completions dict. + # [{role: user, content: Hello}, {...}] -> {messages: [{role:user,...}, ...], model:..., ...} + request_json: dict = { + "messages": conversation, + "extra_body": {"documents": docs}, + } + rewritten = rewriter.transform(request_json, **action.intrinsic_kwargs) + + # TODO: Handle caching here. granite_common doesn't tell us what changed, + # so we will have to invalidate the cache on our side. This requires + # us having specific caching for each Component/Message. + + self.load_adapter(adapter.qualified_name) + + # TODO: This modifies the underlying model. We should set a non-exclusive lock here. + # It should allow generate requests with the same adapter to proceed. This logic also + # needs to be added to the other generate functions. + self._model.set_adapter(adapter.qualified_name) + + generate_input, other_input = ( + granite_common.util.chat_completion_request_to_transformers_inputs( + rewritten, self._tokenizer, self._model + ) + ) + + chat_response: Coroutine[Any, Any, granite_common.ChatCompletionResponse] = ( + asyncio.to_thread( + granite_common.util.generate_with_transformers, + self._tokenizer, + self._model, + generate_input, + other_input, + ) + ) + + output = ModelOutputThunk(None) + output._context = ctx.view_for_generation() + output._action = action + output._model_options = model_options + + # Add another step to the processing function. + async def granite_common_processing( + mot: ModelOutputThunk, + chunk: granite_common.ChatCompletionResponse, + rewritten: granite_common.ChatCompletion, + result_processor: granite_common.IntrinsicsResultProcessor, + input_ids, + ): + res = result_processor.transform(chunk, rewritten) # type: ignore + + # TODO: If we want to support caches, we need to get the GenerateDecoderOnlyOutput. This means we + # probably need to break out the pieces from `generate_with_transformers`. + # processing expects a str or a GenerateDecoderOnlyOutput. Extract the str. + return await self.processing( + mot, res.choices[0].message.content, input_ids=input_ids + ) + + output._process = functools.partial( + granite_common_processing, + rewritten=rewritten, + result_processor=result_processor, + input_ids=generate_input["input_tokens"], + ) + + # TODO: Post-processing should release the lock for this generation. 
+ output._post_process = functools.partial( + self.post_processing, + conversation=conversation, + input_ids=generate_input["input_tokens"], + _format=None, + tool_calls=False, + tools={}, + seed=seed, + ) + + try: + # To support lazy computation, will need to remove this create_task and store just the unexecuted coroutine. + # We can also support synchronous calls by adding a flag and changing this ._generate function. + + # This function should always be called from a running event loop so we don't have to worry about + # scheduling the task to a specific event loop here. + output._generate = asyncio.create_task( + send_to_queue(chat_response, output._async_queue) # type: ignore + ) + output._generate_type = GenerateType.ASYNC + except RuntimeError as e: + # Most likely cause is running this function without an event loop present. + raise e + + return output def _generate_from_context_standard( self, @@ -308,7 +437,7 @@ def _generate_from_context_standard( add_generation_prompt=True, # If we change this, must modify huggingface granite guardian. return_tensors="pt", **self._make_backend_specific_and_remove(model_options), - ).to(self._device) # type: ignore + ) format_kwargs = {} if _format: @@ -453,14 +582,14 @@ async def post_processing( assert mot.value is not None # Add an entry to the cache for ALora reuse. - if self._use_caches: + if self._use_caches and mot._meta.get("hf_output", None) is not None: output_complete = mot._meta["hf_output"].sequences[0] cache: DynamicCache = mot._meta["hf_output"].past_key_values # type: ignore cache_info = HFAloraCacheInfo( kv_cache=cache, merged_token_ids=output_complete, - merged_attention=torch.ones_like(output_complete).to(self._device), + merged_attention=torch.ones_like(output_complete), q_end=len(input_ids[0]), # type: ignore ) @@ -520,9 +649,7 @@ def generate_from_raw( prompts = [self.formatter.print(action) for action in actions] # batch-encoding call is deprecated in favor of this - inputs = self._tokenizer(prompts, return_tensors="pt", padding=True).to( - self._device - ) + inputs = self._tokenizer(prompts, return_tensors="pt", padding=True) if format is None: outputs = self._model.generate( # type: ignore @@ -681,72 +808,68 @@ def _filter_chat_template_only_options( } return {k: v for k, v in model_options.items() if k not in chat_template_only} - # region ALora loading, unloading, and utility functions. - def add_alora(self, alora: HFAlora): - """Loads an ALora for this backend. - - Args: - alora (str): identifier for the ALora adapter - """ - from alora.peft_model_alora import aLoRAPeftModelForCausalLM # type: ignore - - assert issubclass(alora.__class__, HFAlora), ( - f"cannot add an ALora of type {alora.__class__} to model; must inherit from {HFAlora.__class__}" - ) - assert alora._backend == self, "Cannot load an ALora into the wrong backend." + # region Adapter loading, unloading, and utility functions. + def add_adapter(self, adapter: LocalHFAdapter): + """Adds the given adapter to the backend. 
Must not have been added to a different backend.""" + if adapter.backend is not None: + if adapter.backend is self: + FancyLogger.get_logger().warning( + f"attempted to add adapter {adapter.name} with type {adapter.adapter_type} to the same backend {adapter.backend}" + ) + return + else: + raise Exception( + f"adapter {adapter.name} with type {adapter.adapter_type} has already been added to backend {adapter.backend}" + ) - if self.get_alora(alora.name) is not None: + if self._added_adapters.get(adapter.qualified_name) is not None: FancyLogger.get_logger().warning( - f"Client code attempted to add {alora.name} but {alora.name} was already added to {self.__class__}. The backend is refusing to do this, because ALora loading is not idempotent." + f"Client code attempted to add {adapter.name} with type {adapter.adapter_type} but {adapter.name} was already added to {self.__class__}. The backend is refusing to do this, because adapter loading is not idempotent." ) return None - if self.alora_model is None: - base_model = self._model - self.alora_model = aLoRAPeftModelForCausalLM.from_pretrained( - base_model, alora.path_or_model_id, alora.name + base_model_name = self._hf_model_id.split("/")[1] + adapter.path = adapter.get_local_hf_path(base_model_name) + adapter.backend = self + self._added_adapters[adapter.qualified_name] = adapter + + def load_adapter(self, adapter_qualified_name: str): + """Loads the given adapter for the backend. Must have previously been added.""" + adapter = self._added_adapters.get(adapter_qualified_name, None) + if adapter is None: + raise ValueError( + f"could not load adapter {adapter_qualified_name} for backend {self}: adapter was not previously added" ) - else: - self.alora_model.load_adapter(alora.path_or_model_id, alora.name) - - self._aloras[alora.name] = alora - - def get_alora(self, alora_name: str) -> Alora | None: - """Returns the ALora by name, or None if that ALora isn't loaded.""" - return self._aloras.get(alora_name) - - def get_aloras(self) -> list[Alora]: - """Returns a list of all loaded ALora adapters.""" - return list(self._aloras.values()) - - # endregion + try: + self._model.load_adapter(adapter.path, adapter.qualified_name) + except ValueError as e: + # If it's just that it's already loaded, ignore it. + if f"Adapter with name {adapter_qualified_name} already exists." not in str( + e + ): + raise e -class HFAlora(Alora, abc.ABC): - """ALoras that work with the local huggingface backend.""" + # Loading an adapter activates it. We disable adapters immediately after. + # Prefer this over `.disable_adapters()`; the disable function doesn't always + # seem to work. + self._model.set_adapter([]) + self._loaded_adapters[adapter.qualified_name] = adapter + + def unload_adapter(self, adapter_qualified_name: str): + """Unloads the given adapter from the backend.""" + # Check if the backend knows about this adapter. + adapter = self._loaded_adapters.get(adapter_qualified_name, None) + if adapter is None: + FancyLogger.get_logger().info( + f"could not unload adapter {adapter_qualified_name} for backend {self}: adapter is not loaded" + ) + return - def __init__( - self, - name: str, - path_or_model_id: str, - generation_prompt: str, - backend: LocalHFBackend, - ): - """Initialize an ALora that should work with huggingface backends that support ALoras. + self._model.delete_adapter(adapter.qualified_name) - Args: - name (str): An arbitrary name/label to assign to an ALora. This is irrelevant from the alora's (huggingface) model id. 
- path_or_model_id (str): A local path to ALora's weights or a Huggingface model_id to an ALora. - generation_prompt (str): A prompt used to "activate" the Lora. This string goes between the pre-activation context and the aLora generate call. This needs to be provided by the entity that trained the ALora. - backend (LocalHFBackend): Mained as a pointer to the backend to which this this ALora is attached. - """ - super().__init__(name) - self.path_or_model_id = path_or_model_id - self._backend = backend - self._generation_prompt = generation_prompt - self._generation_prompt_tokens = self._backend._tokenizer( - self._generation_prompt, return_tensors="pt" - ).to(self._backend._device) + # Remove the adapter from the list of loaded adapters. + del self._loaded_adapters[adapter.qualified_name] class HFProcessRewardModel(PRM, abc.ABC): @@ -764,23 +887,9 @@ def __init__( """ super().__init__(model_name_or_path) - # auto-device if not more specific - self._device = device - if device is None: - device_name: str = ( - "cuda" - if torch.cuda.is_available() - else "mps" - if torch.backends.mps.is_available() - else "cpu" - ) - assert device_name is not None - self._device = torch.device(device_name) # type: ignore - self.model: PreTrainedModel = AutoModelForCausalLM.from_pretrained( self.model_name_or_path, torch_dtype=torch.bfloat16 ) - self.model.to(self._device) # type: ignore self.model.eval() self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path) diff --git a/mellea/backends/openai.py b/mellea/backends/openai.py index e147dfc4..52609250 100644 --- a/mellea/backends/openai.py +++ b/mellea/backends/openai.py @@ -5,22 +5,25 @@ import datetime import functools import inspect -import json from collections.abc import Callable, Coroutine -from enum import Enum -from typing import TYPE_CHECKING, Any, overload -from urllib.parse import urlparse +from typing import TYPE_CHECKING, Any, cast +import granite_common import openai import requests -from huggingface_hub import snapshot_download from openai.types.chat import ChatCompletion from openai.types.chat.chat_completion_chunk import ChatCompletionChunk from openai.types.completion import Completion import mellea.backends.model_ids as model_ids from mellea.backends import BaseModelSubclass -from mellea.backends.aloras import Alora, AloraBackendMixin +from mellea.backends.adapters.adapter import ( + AdapterMixin, + AdapterType, + GraniteCommonAdapter, + OpenAIAdapter, + get_adapter_for_intrinsic, +) from mellea.backends.formatter import Formatter, FormatterBackend, TemplateFormatter from mellea.backends.model_ids import ModelIdentifier from mellea.backends.tools import ( @@ -28,7 +31,7 @@ add_tools_from_model_options, convert_tools_to_json, ) -from mellea.backends.types import ModelOption +from mellea.backends.types import ModelOption, _server_type, _ServerType from mellea.helpers.async_helpers import ( ClientCache, get_current_event_loop, @@ -43,11 +46,13 @@ CBlock, Component, Context, + Document, GenerateLog, GenerateType, ModelOutputThunk, ) from mellea.stdlib.chat import Message +from mellea.stdlib.intrinsics.intrinsic import Intrinsic from mellea.stdlib.requirement import ALoraRequirement, LLMaJRequirement, Requirement if TYPE_CHECKING: @@ -58,25 +63,7 @@ format: None = None # typing this variable in order to shadow the global format function and ensure mypy checks for errors -class _ServerType(Enum): - LOCALHOST = 1 - OPENAI = 2 - - -def _server_type(url: str) -> _ServerType | None: - try: - parsed = urlparse(url) - hostname = 
parsed.hostname - if hostname in ("localhost", "127.0.0.1", "::1"): - return _ServerType.LOCALHOST - elif hostname == "api.openai.com": - return _ServerType.OPENAI - except Exception as e: - print(f"Error parsing URL: {e}") - return None - - -class OpenAIBackend(FormatterBackend, AloraBackendMixin): +class OpenAIBackend(FormatterBackend, AdapterMixin): """A generic OpenAI compatible backend.""" def __init__( @@ -170,6 +157,8 @@ def __init__( else: self._api_key = api_key + self._server_type = _server_type(self._base_url) + self._openai_client_kwargs = self.filter_openai_client_kwargs(**kwargs) self._client = openai.OpenAI( # type: ignore @@ -181,8 +170,10 @@ def __init__( # Call once to create an async_client and populate the cache. _ = self._async_client - # ALoras that have been loaded for this model. - self._aloras: dict[str, OpenAIAlora] = {} + # Adapters can be made know to the backend (added) and + # loaded / active. + self._added_adapters: dict[str, OpenAIAdapter] = {} + self._loaded_adapters: dict[str, OpenAIAdapter] = {} @property def _async_client(self) -> openai.AsyncOpenAI: @@ -302,14 +293,13 @@ def generate_from_context( assert ctx.is_chat_context, NotImplementedError( "The Openai backend only supports chat-like contexts." ) - mot = self.generate_from_chat_context( + return self.generate_from_chat_context( action, ctx, _format=format, model_options=model_options, tool_calls=tool_calls, ) - return mot, ctx.add(action).add(mot) def generate_from_chat_context( self, @@ -320,81 +310,206 @@ def generate_from_chat_context( | None = None, # Type[BaseModelSubclass] is a class object of a subclass of BaseModel model_options: dict | None = None, tool_calls: bool = False, - ) -> ModelOutputThunk: + ) -> tuple[ModelOutputThunk, Context]: """Generates a new completion from the provided Context using this backend's `Formatter`.""" - if issubclass(type(action), Requirement): - # The general rule is that we reroute to the alora if it exists. - reroute_to_alora = self.get_alora("constraint") is not None - # However, there are some exceptions: - if not self.default_to_constraint_checking_alora: - reroute_to_alora = False + # Requirements can be automatically rerouted to a requirement adapter. + if isinstance(action, Requirement): + # See docs/dev/requirement_aLoRA_rerouting.md + reroute_to_alora = self.default_to_constraint_checking_alora + adapter_name = "requirement_check" + + if isinstance(action, ALoraRequirement): + reroute_to_alora = True + adapter_name = action.intrinsic_name + alora_action = action + else: + assert action.description is not None, ( + "must have a description when generating from a requirement" + ) + alora_action = ALoraRequirement(action.description, adapter_name) + + # Check if a requirement_check (or AloraRequirement specified) adapter exists. + alora_req_adapter = get_adapter_for_intrinsic( + adapter_name, [AdapterType.ALORA], self._added_adapters + ) + if alora_req_adapter is None: + # Log a warning if using an AloraRequirement but no adapter fit. 
+ if reroute_to_alora: + FancyLogger.get_logger().warning( + f"attempted to use an AloraRequirement but backend {self} doesn't have the specified adapter added {adapter_name}; defaulting to regular generation" + ) + if issubclass(type(action), LLMaJRequirement): reroute_to_alora = False - if issubclass(type(action), ALoraRequirement): - reroute_to_alora = True + if reroute_to_alora: - return self._generate_from_chat_context_alora( - action, ctx, _format=_format, model_options=model_options + # Keep the alora requirement handling separate for now. + mot = self._generate_from_intrinsic( + alora_action, ctx, model_options=model_options ) + return mot, ctx.add(alora_action).add(mot) - return self._generate_from_chat_context_standard( + elif isinstance(action, Intrinsic): + mot = self._generate_from_intrinsic( + action, ctx, model_options=model_options + ) + return mot, ctx.add(action).add(mot) + + mot = self._generate_from_chat_context_standard( action, ctx, _format=_format, model_options=model_options, tool_calls=tool_calls, ) + return mot, ctx.add(action).add(mot) - def _generate_from_chat_context_alora( - self, - action: Component | CBlock, - ctx: Context, - *, - _format: type[BaseModelSubclass] - | None = None, # Type[BaseModelSubclass] is a class object of a subclass of BaseModel - model_options: dict | None = None, + def _generate_from_intrinsic( + self, action: Intrinsic, ctx: Context, *, model_options: dict | None = None ) -> ModelOutputThunk: - match action: - case ALoraRequirement(): - alora_for_this_request = ( - self.get_alora("constraint") - if action.alora is None - else action.alora - ) - case _: - alora_for_this_request = self.get_alora("constraint") - assert alora_for_this_request is not None, ( - "This code block should not execute unless there is a 'constraint' alora loaded." - ) + model_opts = self._simplify_and_merge( + model_options, is_chat_context=ctx.is_chat_context + ) + if len(model_opts.items()) > 0: + FancyLogger.get_logger().info( + "passing in model options when generating with an adapter; some model options may be overwritten / ignored" + ) - # Construct the linearized context. This is very similar to normal generation. - linearized_ctx = ctx.view_for_generation() - assert linearized_ctx is not None and len(linearized_ctx) > 1 - msgs = self.formatter.to_chat_messages(linearized_ctx) - user_message, assistant_message = msgs[-2].content, msgs[-1].content - assert alora_for_this_request is not None - assert type(user_message) is str - assert type(assistant_message) is str - assert _format is None, "Structured outputs are not supported by ALoRAs." - - model_opts = self._simplify_and_merge(model_options, is_chat_context=True) - - alora_output = alora_for_this_request.generate_using_strings( - input=user_message, - response=assistant_message, - constraint=action.description, # type: ignore - stream=model_opts.get(ModelOption.STREAM, False), + linearized_context = ctx.view_for_generation() + assert linearized_context is not None, ( + "Cannot generate from a non-linear context in a FormatterBackend." ) + if len(linearized_context) == 0: + FancyLogger.get_logger().warning( + f"generating with an intrinsic when the context is empty; this is typically incorrect: {action}" + ) - # The alora function doesn't set up all the fields. - alora_output._context = linearized_ctx - alora_output._action = action - alora_output._model_options = model_options + # Convert our linearized context into a sequence of chat messages. Template formatters have a standard way of doing this. 
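+        # E.g. (illustrative sketch only): a context of [Message("user", "..."), Message("assistant", "...")]
+        # is serialized below into OpenAI-style role/content dicts, which granite_common
+        # then rewrites into the intrinsic-specific chat completion request.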
+ messages: list[Message] = self.formatter.to_chat_messages(linearized_context) - # TODO: Figure out what info we want to populate for aloras here. - alora_output._generate_log = GenerateLog() + conversation: list[dict] = [] - return alora_output + system_prompt = model_opts.get(ModelOption.SYSTEM_PROMPT, "") + if system_prompt != "": + conversation.append({"role": "system", "content": system_prompt}) + conversation.extend([self.message_to_openai_message(m) for m in messages]) + docs = self.messages_to_docs(messages) + + if model_opts.get(ModelOption.STREAM, None) is not None: + # Intrinsics don't support streaming because of their post-processing step. + FancyLogger.get_logger().warning( + "intrinsics cannot use streaming; removing model option" + ) + del model_opts[ModelOption.STREAM] + + adapter = get_adapter_for_intrinsic( + action.intrinsic_name, action.adapter_types, self._added_adapters + ) + if adapter is None: + raise ValueError( + f"backend ({self}) has no adapter for processing intrinsic: {action.intrinsic_name}" + ) + + # TODO: Code below this point is mostly specific to RagIntrinsics (and granite_common). + # It should be refactored into a specific adapter.transform() function. + assert isinstance(adapter, GraniteCommonAdapter), ( + "currently Mellea only supports GraniteCommonAdapters and Intrinsics" + ) + + intrinsic_config = adapter.config + if intrinsic_config is None: + # If the adapter wasn't initialized with a config, grab one here based off the backend's model. + intrinsic_config_file = granite_common.intrinsics.util.obtain_io_yaml( + action.intrinsic_name, self._hf_model_id.split("/")[-1] + ) + intrinsic_config = granite_common.intrinsics.util.make_config_dict( + config_file=intrinsic_config_file + ) + intrinsic_config = cast( + dict, intrinsic_config + ) # TODO: Can remove if util function gets exported properly. + + rewriter = granite_common.IntrinsicsRewriter( + config_dict=intrinsic_config, model_name=adapter.qualified_name + ) + result_processor = granite_common.IntrinsicsResultProcessor( + config_dict=intrinsic_config + ) + + # Convert our conversation into a proper chat completions dict. + # [{role: user, content: Hello}, {...}] -> {messages: [{role:user,...}, ...], model:..., ...} + request_json: dict = { + "messages": conversation, + "extra_body": {"documents": docs}, + } + + rewritten = rewriter.transform(request_json, **action.intrinsic_kwargs) + + self.load_adapter(adapter.qualified_name) + chat_response: Coroutine[Any, Any, ChatCompletion] = ( + self._async_client.chat.completions.create(**rewritten.model_dump()) + ) + + output = ModelOutputThunk(None) + output._context = linearized_context + output._action = action + output._model_options = model_opts + output._meta["granite_common_chat_response"] = rewritten + + # Add another step to the processing function. + async def granite_common_processing( + mot: ModelOutputThunk, + chunk: ChatCompletion, + rewritten: ChatCompletion, + result_processor: granite_common.IntrinsicsResultProcessor, + ): + res = result_processor.transform(chunk, rewritten) # type: ignore + + # processing expects a ChatCompletion object. Granite common differs slightly from this. Re-create the necessary object. + full_res = ChatCompletion( + id=chunk.id, + choices=[], + created=chunk.created, + model=chunk.model, + usage=chunk.usage, + object="chat.completion", + ) + + # Set the choices here so that pydantic validation doesn't error out. 
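+            # (Assigning choices after construction sidesteps strict pydantic validation
+            # of the granite_common choice objects, which are not exact openai types.)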
+ full_res.choices = res.choices # type: ignore + + return await self.processing(mot, full_res) + + output._process = functools.partial( + granite_common_processing, + rewritten=rewritten, # type: ignore + result_processor=result_processor, + ) + + output._post_process = functools.partial( + self.post_processing, + tools={}, + conversation=conversation, + thinking=None, + seed=model_opts.get(ModelOption.SEED, None), + _format=None, + ) + + try: + # To support lazy computation, will need to remove this create_task and store just the unexecuted coroutine. + # We can also support synchronous calls by adding a flag and changing this ._generate function. + + # This function should always be called from a running event loop so we don't have to worry about + # scheduling the task to a specific event loop here. + output._generate = asyncio.create_task( + send_to_queue(chat_response, output._async_queue) + ) + output._generate_type = GenerateType.ASYNC + except RuntimeError as e: + # Most likely cause is running this function without an event loop present + raise e + + return output @staticmethod def message_to_openai_message(msg: Message): @@ -431,6 +546,23 @@ def message_to_openai_message(msg: Message): # ] # } + @staticmethod + def messages_to_docs(msgs: list[Message]) -> list[dict[str, str]]: + """Extracts the docs from a list of messages.""" + docs: list[Document] = [] + for message in msgs: + if message._docs is not None: + docs.extend(message._docs) + + # TODO: We can add doc_ids here for vllm if needed. + json_docs: list[dict[str, str]] = [] + for doc in docs: + json_doc = {"text": doc.text} + if doc.title is not None: + json_doc["title"] = doc.title + json_docs.append(json_doc) + return json_docs + def _generate_from_chat_context_standard( self, action: Component | CBlock, @@ -726,41 +858,83 @@ def generate_from_raw( return results - def add_alora(self, alora: "OpenAIAlora"): - """Loads an ALora for this backend. - - Args: - alora (str): identifier for the ALora adapter - """ - assert issubclass(alora.__class__, OpenAIAlora), ( - f"cannot add an ALora of type {alora.__class__} to model; must inherit from {OpenAIAlora.__class__}" - ) - assert alora._backend == self, "Cannot load an ALora into the wrong backend." + def add_adapter(self, adapter: OpenAIAdapter): + """Adds the given adapter to the backend. Must not have been added to a different backend.""" + if adapter.backend is not None: + if adapter.backend is self: + FancyLogger.get_logger().warning( + f"attempted to add adapter {adapter.name} with type {adapter.adapter_type} to the same backend {adapter.backend}" + ) + return + else: + raise Exception( + f"adapter {adapter.name} with type {adapter.adapter_type} has already been added to backend {adapter.backend}" + ) - if self.get_alora(alora.name) is not None: + if self._added_adapters.get(adapter.qualified_name, None) is not None: FancyLogger.get_logger().warning( - f"Client code attempted to add {alora.name} but {alora.name} was already added to {self.__class__}. The backend is refusing to do this, because ALora loading is not idempotent." + f"Client code attempted to add {adapter.name} with type {adapter.adapter_type} but it was already added to {self.__class__}. This attempt to add the adapter will be ignored." 
) return None - assert _server_type(self._base_url) == _ServerType.LOCALHOST, ( - "alora is supported only for locally running vllm instances" + base_model_name = self._hf_model_id.split("/")[-1] + adapter.path = adapter.get_open_ai_path( + base_model_name, server_type=self._server_type + ) + adapter.backend = self + self._added_adapters[adapter.qualified_name] = adapter + + def load_adapter(self, adapter_qualified_name: str): + """Loads the given adapter for the backend. Must have previously been added.""" + adapter = self._added_adapters.get(adapter_qualified_name, None) + if adapter is None: + raise ValueError( + f"could not load adapter {adapter_qualified_name} for backend {self}: adapter was not previously added" + ) + + url = f"{self._base_url}/load_lora_adapter" + response = requests.post( + url, + json={"lora_name": adapter_qualified_name, "lora_path": adapter.path}, + headers={"Content-Type": "application/json"}, ) - snapshot_path = snapshot_download(alora.path) + err: str | None = None + match response.status_code: + case 200: + FancyLogger.get_logger().info( + f"{url}: status {response.status_code} {response.text}" + ) + case 400: + if "has already been loaded." in str(response.content): + FancyLogger.get_logger().warning( + f"{url}: status {response.status_code} {response.text}" + ) + else: + err = f"{url}: status {response.status_code} {response.text}" + case _: + err = f"{url}: status {response.status_code} {response.text}" - # https://docs.vllm.ai/en/stable/features/lora.html#using-api-endpoints - # curl -X POST http://localhost:8000/v1/load_lora_adapter \ - # -H "Content-Type: application/json" \ - # -d '{ - # "lora_name": "sql_adapter", - # "lora_path": "/path/to/sql-lora-adapter" - # }' + if err is not None: + FancyLogger.get_logger().error(err) + raise Exception(f"error loading adapter {adapter_qualified_name}: {err}") - url = f"{self._base_url}/load_lora_adapter" + self._loaded_adapters[adapter.qualified_name] = adapter + + def unload_adapter(self, adapter_qualified_name: str): + """Unloads the given adapter from the backend.""" + # Check if the backend knows about this adapter. + adapter = self._loaded_adapters.get(adapter_qualified_name, None) + if adapter is None: + FancyLogger.get_logger().info( + f"could not unload adapter {adapter_qualified_name} for backend {self}: adapter is not loaded" + ) + return + + url = f"{self._base_url}/unload_lora_adapter" response = requests.post( url, - json={"lora_name": alora.name, "lora_path": snapshot_path}, + json={"lora_name": adapter_qualified_name}, headers={"Content-Type": "application/json"}, ) @@ -769,23 +943,23 @@ def add_alora(self, alora: "OpenAIAlora"): FancyLogger.get_logger().info( f"{url}: status {response.status_code} {response.text}" ) - self._aloras[alora.name] = alora + case 404: + # This response code indicates that the adapter isn't currently loaded; + # which is the goal of this function. Log it but proceed as if successful. + FancyLogger.get_logger().info( + f"{url}: status {response.status_code} {response.text}" + ) case _: + # Unknown err. 
FancyLogger.get_logger().error( f"{url}: status {response.status_code} {response.text}" ) + raise Exception( + f"error unloading adapter {adapter_qualified_name}: {url}: status {response.status_code} {response.text}" + ) - self._aloras[alora.name] = alora - - return None - - def get_alora(self, alora_name: str) -> Alora | None: - """Returns the ALora by name, or None if that ALora isn't loaded.""" - return self._aloras.get(alora_name) - - def get_aloras(self) -> list[Alora]: - """Returns a list of all loaded ALora adapters.""" - return list(self._aloras.values()) + # Remove the adapter from the list of loaded adapters. + del self._loaded_adapters[adapter.qualified_name] def apply_chat_template(self, chat: list[dict[str, str]]): """Apply the chat template for the model, if such a model is available (e.g., when it can deduce the huggingface model id).""" @@ -805,23 +979,3 @@ def apply_chat_template(self, chat: list[dict[str, str]]): ) return self._tokenizer.apply_chat_template(chat, tokenize=False) - - -class OpenAIAlora(Alora, abc.ABC): - """ALoras that work with OpenAI backend.""" - - def __init__( - self, name: str, path: str, generation_prompt: str, backend: OpenAIBackend - ): - """Initialize an ALora that should work with OpenAI backends that support ALoras. - - Args: - name (str): An arbitrary name/label to assign to an ALora. This is irrelevant from the alora's (huggingface) model id. - path (str): A local path to ALora's weights or a Huggingface model_id to an ALora. - generation_prompt (str): A prompt used to "activate" the Lora. This string goes between the pre-activation context and the aLora generate call. This needs to be provided by the entity that trained the ALora. - backend (OpenAIBackend): Mained as a pointer to the backend to which this this ALora is attached. 
- """ - super().__init__(name) - self.path = path - self._backend = backend - self._generation_prompt = generation_prompt diff --git a/mellea/backends/process_reward_models/huggingface/prms.py b/mellea/backends/process_reward_models/huggingface/prms.py index 2525b8e6..c8c7be78 100644 --- a/mellea/backends/process_reward_models/huggingface/prms.py +++ b/mellea/backends/process_reward_models/huggingface/prms.py @@ -71,7 +71,7 @@ def score(self, query: str, response: str) -> tuple[list[float], list[list[float # move each item of the batch to the device for i in batches: - batches[i] = batches[i].to(self.model.device) + batches[i] = batches[i] with torch.no_grad(): model_outputs = self.model(**batches) @@ -178,7 +178,7 @@ def __init__( # initialize PRM head self.prm_head = torch.nn.Linear( self.model.config.hidden_size, 2, bias=False, dtype=self.model.dtype - ).to(self.model.device) + ) state = torch.load(model_name_or_path + "/added_params.bin") # need to do this-- we save model dict as `prm_head.weight` during training @@ -205,7 +205,7 @@ def score(self, query: str, response: str) -> tuple[list[float], list[list[float batch = self.prepare_inputs(query, list_of_steps) # move each item of the batch to the device for i in batch: - batch[i] = batch[i].to(self.model.device) + batch[i] = batch[i] with torch.no_grad(): model_outputs = self.model(**batch, output_hidden_states=True) diff --git a/mellea/backends/types.py b/mellea/backends/types.py index d7f0db12..89f03851 100644 --- a/mellea/backends/types.py +++ b/mellea/backends/types.py @@ -1,6 +1,8 @@ """Useful type definitions for models, formatters, and backends.""" +from enum import Enum from typing import Any +from urllib.parse import urlparse from mellea.helpers.fancy_logger import FancyLogger @@ -109,3 +111,27 @@ def merge_model_options( for k, v in overwrite_opts.items(): new_options[k] = v return new_options + + +class _ServerType(Enum): + """Different types of servers that might be relevant for a backend.""" + + UNKNOWN = 0 + LOCALHOST = 1 + OPENAI = 2 + REMOTE_VLLM = 3 + """Must be set manually for now.""" + + +def _server_type(url: str) -> _ServerType: + """Find a server type based on the url.""" + try: + parsed = urlparse(url) + hostname = parsed.hostname + if hostname in ("localhost", "127.0.0.1", "::1", "0.0.0.0"): + return _ServerType.LOCALHOST + elif hostname == "api.openai.com": + return _ServerType.OPENAI + except Exception as e: + print(f"Error parsing URL: {e}") + return _ServerType.UNKNOWN diff --git a/mellea/stdlib/base.py b/mellea/stdlib/base.py index bf0c1954..cc546a4b 100644 --- a/mellea/stdlib/base.py +++ b/mellea/stdlib/base.py @@ -149,6 +149,32 @@ def get_images_from_component(c: Component) -> None | list[ImageBlock]: return None +# TODO: Add support for passing in docs as model options. +class Document(Component): + """Documents should typically be used in a Message object.""" + + def __init__(self, text: str, title: str | None = None): + """Create a document object. Should typically be used as a list in the `_docs` field of Message.""" + self.text = text + self.title = title + + def parts(self) -> list[Component | CBlock]: + """The set of all the constituent parts of the `Component`.""" + raise NotImplementedError("parts isn't implemented by default") + + def format_for_llm(self) -> str: + """Formats the `Document` into a string. 
+ + Returns: a string + """ + doc = "" + if self.title is not None: + doc += f"'{self.title}': " + doc += f"{self.text}" + + return doc + + class GenerateType(enum.Enum): """Used to track what functions can be used to extract a value from a ModelOutputThunk.""" diff --git a/mellea/stdlib/chat.py b/mellea/stdlib/chat.py index 7f5bbb4a..574e6fa6 100644 --- a/mellea/stdlib/chat.py +++ b/mellea/stdlib/chat.py @@ -8,6 +8,7 @@ CBlock, Component, Context, + Document, ImageBlock, ModelOutputThunk, ModelToolCall, @@ -26,6 +27,7 @@ def __init__( content: str, *, images: None | list[ImageBlock] = None, + documents: None | list[Document] = None, ): """Initializer for Chat messages. @@ -33,10 +35,12 @@ def __init__( role (str): The role that this message came from (e.g., user, assistant). content (str): The content of the message. images (list[ImageBlock]): The images associated with the message if any. + documents (list[Document]): documents associated with the message if any. """ self.role = role self.content = content self._images = images + self._docs = documents @property def images(self) -> None | list[str]: @@ -59,7 +63,12 @@ def format_for_llm(self) -> TemplateRepresentation: """ return TemplateRepresentation( obj=self, - args={"role": self.role, "content": self.content, "images": self.images}, + args={ + "role": self.role, + "content": self.content, + "images": self.images, + "documents": self._docs, + }, template_order=["*", "Message"], ) @@ -68,7 +77,11 @@ def __str__(self): images = [] if self.images is not None: images = [f"{i[:20]}..." for i in self.images] - return f'mellea.Message(role="{self.role}", content="{self.content}", images="{images}")' + + docs = [] + if self._docs is not None: + docs = [f"{doc.format_for_llm()[:10]}..." for doc in self._docs] + return f'mellea.Message(role="{self.role}", content="{self.content}", images="{images}", documents="{docs}")' class ToolMessage(Message): diff --git a/mellea/stdlib/intrinsics/intrinsic.py b/mellea/stdlib/intrinsics/intrinsic.py new file mode 100644 index 00000000..2d6e9598 --- /dev/null +++ b/mellea/stdlib/intrinsics/intrinsic.py @@ -0,0 +1,59 @@ +"""Module for Intrinsics.""" + +import pathlib +from copy import copy +from typing import cast + +from mellea.backends.adapters.adapter import AdapterType +from mellea.stdlib.base import CBlock, Component, TemplateRepresentation + + +class Intrinsic(Component): + """A component representing an intrinsic.""" + + def __init__( + self, + intrinsic_name: str, + intrinsic_kwargs: dict | None = None, + adapter_types: list[AdapterType] = [AdapterType.ALORA, AdapterType.LORA], + ) -> None: + """A component for rewriting messages using intrinsics. + + Intrinsics are special components that transform a chat completion request. + These transformations typically take the form of: + - parameter changes (typically structured outputs) + - adding new messages to the chat + - editing existing messages + + An intrinsic component should correspond to a loaded adapter. + + Args: + intrinsic_name: the name of the intrinsic; must match the adapter + intrinsic_kwargs: some intrinsics require kwargs when utilizing them; provide them here + adapter_types: list of adapter types that can be used for this intrinsic + """ + self.intrinsic_name = intrinsic_name + + # Copy the list so that this intrinsic has its own list that can be modified independently. 
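+        # (The default argument is a shared, mutable list, so copying also prevents one
+        # Intrinsic instance from mutating the defaults seen by another.)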
+ self.adapter_types = copy(adapter_types) + + if intrinsic_kwargs is None: + intrinsic_kwargs = {} + self.intrinsic_kwargs = intrinsic_kwargs + + def parts(self) -> list[Component | CBlock]: + """The set of all the constituent parts of the `Intrinsic`. + + Will need to be implemented by subclasses since not all intrinsics are output + as text / messages. + """ + raise NotImplementedError("parts isn't implemented by default") + + def format_for_llm(self) -> TemplateRepresentation | str: + """`Intrinsic` doesn't implement `format_for_default`. Formats the `Intrinsic` into a `TemplateRepresentation` or string. + + Returns: a `TemplateRepresentation` or string + """ + raise NotImplementedError( + "`Intrinsic` doesn't implement format_for_llm by default. You should only use an `Intrinsic` as the action and not as a part of the context." + ) diff --git a/mellea/stdlib/requirement.py b/mellea/stdlib/requirement.py index f10a3aaf..3168e7f4 100644 --- a/mellea/stdlib/requirement.py +++ b/mellea/stdlib/requirement.py @@ -1,13 +1,14 @@ """Requirements are a special type of Component used as input to the "validate" step in Instruct/Validate/Repair design patterns.""" import inspect +import json import re from collections.abc import Callable from copy import copy from typing import Any, overload from mellea.backends import Backend, BaseModelSubclass -from mellea.backends.aloras import Alora +from mellea.backends.adapters.adapter import AdapterType from mellea.helpers.fancy_logger import FancyLogger from mellea.stdlib.base import ( CBlock, @@ -17,6 +18,7 @@ ModelOutputThunk, TemplateRepresentation, ) +from mellea.stdlib.intrinsics.intrinsic import Intrinsic def default_output_to_bool(x: CBlock | str) -> bool: @@ -176,19 +178,54 @@ class LLMaJRequirement(Requirement): use_aloras: bool = False -class ALoraRequirement(Requirement): +def requirement_check_to_bool(x: CBlock | str) -> bool: + """Checks if a given output should be marked converted to `True`. + + By default, the requirement check alora outputs: `{"requirement_likelihood": 0.0}`. + True if >.5 + """ + output = str(x) + req_dict: dict[str, Any] = json.loads(output) + + likelihood = req_dict.get("requirement_likelihood", None) + if likelihood is None: + FancyLogger.get_logger().warning( + f"could not get value from alora requirement output; looking for `requirement_likelihood` in {req_dict}" + ) + return False + + if likelihood > 0.5: + return True + + return False + + +class ALoraRequirement(Requirement, Intrinsic): """A requirement that always uses an (possibly specified) ALora. If an exception is thrown during the ALora execution path, `mellea` will fall back to LLMaJ. But that is the only case where LLMaJ will be used.""" - def __init__(self, description: str, alora: Alora | None = None): + def __init__(self, description: str, intrinsic_name: str | None = None): """A requirement that is validated by an ALora. Args: description: See `Requirement.__init__` - alora: if None, the ALora with name "constraint" will be used. + intrinsic_name: the name of the intrinsic; must match the adapter """ - super().__init__(description, validation_fn=None) + # TODO: We may want to actually do the validation_fn here so that we can set the score. 
+ super().__init__( + description, validation_fn=None, output_to_bool=requirement_check_to_bool + ) self.use_aloras: bool = True - self.alora = alora + + if intrinsic_name is None: + intrinsic_name = "requirement_check" + + self.intrinsic_name = intrinsic_name + self.adapter_types = [AdapterType.ALORA] + + @property + def intrinsic_kwargs(self): + """An AloraRequirement's intrinsic kwarg is always the requirement's description.""" + return {"requirement": f"{self.description}"} class ScorerRequirement(Requirement): diff --git a/pyproject.toml b/pyproject.toml index b60f2a6a..d8dfe140 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,7 +43,8 @@ dependencies = [ "huggingface-hub>=0.33.4", "pillow", "math_verify", # Needed for Majority Voting Sampling Strategies. - "rouge_score" # Needed for Majority Voting Sampling Strategies. + "rouge_score", # Needed for Majority Voting Sampling Strategies. + "granite_common", # Needed for Intrinsics. ] [project.scripts] @@ -71,7 +72,8 @@ hf = [ "datasets>=4.0.0", "outlines-core==0.1.26", "outlines", # intentionally un-versioned, expecting a minor update. coutlines-core version should be enough to specify it - "peft>=0.16.0", + # "peft>=0.17.2", # This package can be re-enabled once peft 0.17.2 has been released and below commit is confirmed to be a part of it. + "peft @ git+https://github.com/huggingface/peft.git@293aea5df6db240856a77f89955d1a89ce38b50d", "transformers>=4.53.2", "trl==0.19.1", ] diff --git a/test/backends/test_adapters/intrinsics-data/answerability.yaml b/test/backends/test_adapters/intrinsics-data/answerability.yaml new file mode 100644 index 00000000..2b72cd18 --- /dev/null +++ b/test/backends/test_adapters/intrinsics-data/answerability.yaml @@ -0,0 +1,25 @@ +# Model name string, or null to use whatever is provided in the chat completion request. +model: ~ +# JSON schema of the model's output +response_format: | + { + "type": "string", + "enum": ["answerable", "unanswerable"] + } +transformations: + # Convert categorical answer to continuous value by decoding logprobs + - type: likelihood + categories_to_values: + "answerable": 1.0 + "unanswerable": 0.0 + input_path: [] + # Convert scalar value to a record for consistency with other intrinsics + - type: nest + input_path: [] + field_name: "answerability_likelihood" +instruction: ~ +parameters: + # "unanswerable" can be 6 tokens at high temperatures + max_completion_tokens: 6 +# No sentence boundary detection +sentence_boundaries: ~ diff --git a/test/backends/test_adapters/test_adapter.py b/test/backends/test_adapters/test_adapter.py new file mode 100644 index 00000000..b7ae746e --- /dev/null +++ b/test/backends/test_adapters/test_adapter.py @@ -0,0 +1,18 @@ +import pathlib +import pytest + +from mellea.backends.adapters.adapter import GraniteCommonAdapter + +# The backend tests handle most of the adapter testing. Do a basic test here +# to make sure init and config loading work. 
+def test_adapter_init(): + dir_file = pathlib.Path(__file__).parent.joinpath("intrinsics-data") + answerability_file = f"{dir_file}/answerability.yaml" + + adapter = GraniteCommonAdapter("answerability", config_file=answerability_file) + + assert adapter.config is not None + assert adapter.config["parameters"]["max_completion_tokens"] == 6 + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/test/backends/test_huggingface.py b/test/backends/test_huggingface.py index c8cd3a74..5ce264eb 100644 --- a/test/backends/test_huggingface.py +++ b/test/backends/test_huggingface.py @@ -4,7 +4,7 @@ from typing_extensions import Annotated from mellea import MelleaSession -from mellea.backends.aloras.huggingface.granite_aloras import add_granite_aloras +from mellea.backends.adapters.adapter import GraniteCommonAdapter from mellea.backends.cache import SimpleLRUCache from mellea.backends.formatter import TemplateFormatter from mellea.backends.huggingface import LocalHFBackend @@ -23,11 +23,11 @@ def backend(): """Shared HuggingFace backend for all tests in this module.""" backend = LocalHFBackend( - model_id="ibm-granite/granite-3.2-8b-instruct", + model_id="ibm-granite/granite-3.3-8b-instruct", formatter=TemplateFormatter(model_id="ibm-granite/granite-4.0-tiny-preview"), cache=SimpleLRUCache(5), ) - add_granite_aloras(backend) + backend.add_adapter(GraniteCommonAdapter("requirement_check")) return backend @@ -38,6 +38,20 @@ def session(backend): yield session session.reset() +def test_adapters(backend): + assert len(backend._added_adapters.items()) > 0 + + adapter = backend._added_adapters["requirement_check_alora"] + backend.load_adapter(adapter.qualified_name) + assert adapter.qualified_name in backend._loaded_adapters + + # Ensure you can load the same adapter twice. + backend.load_adapter(adapter.qualified_name) + + # Ensure you can unload an adapter. + backend.unload_adapter(adapter.qualified_name) + backend.unload_adapter(adapter.qualified_name) + assert adapter.qualified_name not in backend._loaded_adapters @pytest.mark.qualitative def test_system_prompt(session): @@ -48,27 +62,6 @@ def test_system_prompt(session): print(result) -@pytest.mark.qualitative -async def test_constraint_alora(session, backend): - answer = session.instruct( - "Corporate wants you to find the difference between these two strings: aaaaaaaaaa aaaaabaaaa. Be concise and don't write code to answer the question.", - model_options={ - ModelOption.MAX_NEW_TOKENS: 300 - }, # Until aloras get a bit better, try not to abruptly end generation. 
- ) - - alora_output = backend.get_aloras()[ - 0 - ].generate_using_strings( - input="Find the difference between these two strings: aaaaaaaaaa aaaaabaaaa", - response=str(answer), - constraint="The answer mention that there is a b in the middle of one of the strings but not the other.", - force_yn=False, # make sure that the alora naturally output Y and N without constrained generation - ) - await alora_output.avalue() - assert alora_output.value in ["Y", "N"], alora_output - - @pytest.mark.qualitative def test_constraint_lora_with_requirement(session, backend): answer = session.instruct( @@ -83,7 +76,7 @@ def test_constraint_lora_with_requirement(session, backend): assert len(validation_outputs) == 1 val_result = validation_outputs[0] assert isinstance(val_result, ValidationResult) - assert str(val_result.reason) in ["Y", "N"] + assert "requirement_likelihood" in str(val_result.reason) @pytest.mark.qualitative @@ -116,7 +109,7 @@ def test_constraint_lora_override_does_not_override_alora(session, backend): assert len(validation_outputs) == 1 val_result = validation_outputs[0] assert isinstance(val_result, ValidationResult) - assert str(val_result.reason) in ["Y", "N"] + assert "requirement_likelihood" in str(val_result.reason) backend.default_to_constraint_checking_alora = True @@ -135,6 +128,7 @@ def test_llmaj_req_does_not_use_alora(session, backend): val_result = validation_outputs[0] assert isinstance(val_result, ValidationResult) assert str(val_result.reason) not in ["Y", "N"] + assert "requirement_likelihood" not in str(val_result.reason) @pytest.mark.qualitative diff --git a/test/backends/test_huggingface_tools.py b/test/backends/test_huggingface_tools.py index f78898ec..df59eae8 100644 --- a/test/backends/test_huggingface_tools.py +++ b/test/backends/test_huggingface_tools.py @@ -4,7 +4,6 @@ import mellea.backends.model_ids as model_ids from mellea import MelleaSession -from mellea.backends.aloras.huggingface.granite_aloras import add_granite_aloras from mellea.backends.cache import SimpleLRUCache from mellea.backends.formatter import TemplateFormatter from mellea.backends.huggingface import LocalHFBackend diff --git a/test/backends/test_openai_vllm/environment.yml b/test/backends/test_openai_vllm/environment.yml index 2d0b9e8e..5ca4116b 100644 --- a/test/backends/test_openai_vllm/environment.yml +++ b/test/backends/test_openai_vllm/environment.yml @@ -9,3 +9,4 @@ dependencies: variables: VLLM_USE_PRECOMPILED: 1 # need this flag for alora fork, installation fails otherwise VLLM_ALLOW_RUNTIME_LORA_UPDATING: True # allow loading (a)lora through POST http://localhost:8000/v1/load_lora_adapter + VLLM_DOWNLOAD_RAG_INTRINSICS: False # if True, download the rag-intrinsics-lib (https://huggingface.co/ibm-granite/rag-intrinsics-lib/tree/main); only required for remote vllm servers diff --git a/test/backends/test_openai_vllm/serve.sh b/test/backends/test_openai_vllm/serve.sh index 7746eed1..9b061d0e 100755 --- a/test/backends/test_openai_vllm/serve.sh +++ b/test/backends/test_openai_vllm/serve.sh @@ -26,8 +26,18 @@ # see environment.yml. export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True +# Mellea makes assumptions about the location of the adapter files on the server. By default, it assumes +# referenced adapters are at `./rag-intrinsics-lib/$adapter_name/$adapter_type/$base_model_name`. You can +# change this behavior by defining custom adapter classes that override the path. +# You will also need to set the OpenAIBackend's server_type to REMOTE_VLLM. 
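+# In Python this is roughly `backend._server_type = _ServerType.REMOTE_VLLM`
+# (from mellea.backends.types); the server type is not auto-detected for remote hosts.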
+if [[ "$VLLM_ALLOW_RUNTIME_LORA_UPDATING" == "True" ]] && [[ "$VLLM_DOWNLOAD_RAG_INTRINSICS" == "True" ]] +then + echo "downloading rag-intrinsics-lib from huggingface" + hf download ibm-granite/rag-intrinsics-lib --local-dir ./rag-intrinsics-lib +fi + echo "launching a vllm server. Logs are found in $(readlink -ef $(dirname $0))/vllm.log" -vllm serve ibm-granite/granite-3.2-8b-instruct \ +vllm serve ibm-granite/granite-3.3-8b-instruct \ --enable-activated-lora \ --enable-lora \ --dtype bfloat16 \ diff --git a/test/backends/test_openai_vllm/test_openai_vllm.py b/test/backends/test_openai_vllm/test_openai_vllm.py index 30dff26a..63849833 100644 --- a/test/backends/test_openai_vllm/test_openai_vllm.py +++ b/test/backends/test_openai_vllm/test_openai_vllm.py @@ -1,23 +1,17 @@ # test/rits_backend_tests/test_openai_integration.py from mellea import MelleaSession +from mellea.backends.adapters.adapter import GraniteCommonAdapter from mellea.stdlib.base import CBlock, ModelOutputThunk, ChatContext from mellea.backends.openai import OpenAIBackend -from mellea.backends.aloras.openai.granite_aloras import add_granite_aloras -from mellea.stdlib.requirement import ( - Requirement, - ALoraRequirement, - LLMaJRequirement, - req, -) +from mellea.stdlib.requirement import Requirement, ALoraRequirement, LLMaJRequirement, req from mellea.backends.formatter import TemplateFormatter -from mellea.backends.types import ModelOption +from mellea.backends.types import _ServerType, ModelOption import pydantic from typing_extensions import Annotated import pytest import os - # The vllm tests are disabled by default, because we need a test environment with the vLLM server running. # We use an env var VLLM_TESTS_ENABLED to enable these tests. # to run the tests, do this: VLLM_TESTS_ENABLED="1' pytest test_openai_vllm.py @@ -34,8 +28,8 @@ class TestOpenAIBackend: backend = OpenAIBackend( - model_id="ibm-granite/granite-3.2-8b-instruct", - formatter=TemplateFormatter(model_id="ibm-granite/granite-3.2-8b-instruct"), + model_id="ibm-granite/granite-3.3-8b-instruct", + formatter=TemplateFormatter(model_id="ibm-granite/granite-3.3-8b-instruct"), base_url="http://0.0.0.0:8000/v1", api_key="EMPTY", ) @@ -136,13 +130,29 @@ class Answer(pydantic.BaseModel): class TestOpenAIALoraStuff: backend = OpenAIBackend( - model_id="ibm-granite/granite-3.2-8b-instruct", + model_id="ibm-granite/granite-3.3-8b-instruct", formatter=TemplateFormatter(model_id="ibm-granite/granite-4.0-tiny-preview"), base_url="http://localhost:8000/v1", api_key="EMPTY", ) + backend.add_adapter(GraniteCommonAdapter("requirement_check")) + m = MelleaSession(backend, ctx=ChatContext()) - add_granite_aloras(backend) + + def test_adapters(self): + assert len(self.backend._added_adapters.items()) > 0 + + adapter = self.backend._added_adapters["requirement_check_alora"] + self.backend.load_adapter(adapter.qualified_name) + assert adapter.qualified_name in self.backend._loaded_adapters + + # Ensure you can load the same adapter twice. + self.backend.load_adapter(adapter.qualified_name) + + # Ensure you can unload an adapter. 
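+        # (Unloading twice should be a harmless no-op; the backend just logs that the
+        # adapter is no longer loaded.)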
+ self.backend.unload_adapter(adapter.qualified_name) + self.backend.unload_adapter(adapter.qualified_name) + assert adapter.qualified_name not in self.backend._loaded_adapters def test_system_prompt(self): self.m.reset() @@ -152,23 +162,6 @@ def test_system_prompt(self): ) print(result) - @pytest.mark.xfail - def test_constraint_alora(self): - self.m.reset() - answer = self.m.instruct( - "Corporate wants you to find the difference between these two strings: aaaaaaaaaa aaaaabaaaa" - ) - alora_output = self.backend.get_aloras()[ - 0 - ].generate_using_strings( - input="Find the difference between these two strings: aaaaaaaaaa aaaaabaaaa", - response=str(answer), - constraint="The answer mention that there is a b in the middle of one of the strings but not the other.", - force_yn=False, # make sure that the alora naturally output Y and N without constrained generation - ) - assert alora_output in ["Y", "N"], alora_output - self.m.reset() - def test_constraint_lora_with_requirement(self): self.m.reset() answer = self.m.instruct( @@ -181,7 +174,7 @@ def test_constraint_lora_with_requirement(self): ) assert len(validation_outputs) == 1 val_result = validation_outputs[0] - assert str(val_result.reason) in ["Y", "N"] + assert "requirement_likelihood" in str(val_result.reason) self.m.reset() def test_constraint_lora_override(self): @@ -214,7 +207,7 @@ def test_constraint_lora_override_does_not_override_alora(self): ) assert len(validation_outputs) == 1 non_alora_output = validation_outputs[0] - assert str(non_alora_output.reason) in ["Y", "N"] + assert "requirement_likelihood" in str(non_alora_output.reason) self.backend.default_to_constraint_checking_alora = True self.m.reset() diff --git a/test/stdlib_basics/test_base.py b/test/stdlib_basics/test_base.py index e19c6adc..917619e0 100644 --- a/test/stdlib_basics/test_base.py +++ b/test/stdlib_basics/test_base.py @@ -26,6 +26,5 @@ def format_for_llm(self) -> str: c = _ClosuredComponent() assert len(c.parts()) == 0 - if __name__ == "__main__": pytest.main([__file__]) diff --git a/test/stdlib_basics/test_chat.py b/test/stdlib_basics/test_chat.py new file mode 100644 index 00000000..3ae911b9 --- /dev/null +++ b/test/stdlib_basics/test_chat.py @@ -0,0 +1,25 @@ +import pytest +from mellea.backends.openai import OpenAIBackend +from mellea.stdlib.base import Document +from mellea.stdlib.chat import Message + +def test_message_with_docs(): + doc = Document("I'm text!", "Im a title!") + msg = Message("user", "hello", documents=[doc]) + + assert msg._docs is not None + assert doc in msg._docs + + docs = OpenAIBackend.messages_to_docs([msg]) + assert len(docs) == 1 + assert docs[0]["text"] == doc.text + assert docs[0]["title"] == doc.title + + assert "Im a titl..." 
in str(msg) + + tr = msg.format_for_llm() + assert tr.args["documents"] + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/uv.lock b/uv.lock index db659e41..9a7567b1 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 3 +revision = 2 requires-python = ">=3.10" resolution-markers = [ "python_full_version >= '3.14' and sys_platform == 'darwin'", @@ -1525,6 +1525,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/01/61/d4b89fec821f72385526e1b9d9a3a0385dda4a72b206d28049e2c7cd39b8/gitpython-3.1.45-py3-none-any.whl", hash = "sha256:8908cb2e02fb3b93b7eb0f2827125cb699869470432cc885f019b8fd0fccff77", size = 208168, upload-time = "2025-07-24T03:45:52.517Z" }, ] +[[package]] +name = "granite-common" +version = "0.3.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "jsonschema" }, + { name = "pydantic" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/70/9b/c0e846c0517c7581e63901f90f23a28aa6758ec2b686171a5f23ba73ae48/granite_common-0.3.2.tar.gz", hash = "sha256:f5bc850573700f160bab0ae921d5156a5fd4594ed6806ae82ddf8a6043aa4331", size = 273140, upload-time = "2025-10-24T19:29:51.449Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6b/54/cf965d50fe493f4fb8ab1d4dea371a5d9d74d8e0bfdf2bd17f41a3af973b/granite_common-0.3.2-py3-none-any.whl", hash = "sha256:45d5a99e264f9e009215daf039c85a1e1ea216983962bcff2c75b4fa4a815d87", size = 77490, upload-time = "2025-10-24T19:29:49.97Z" }, +] + [[package]] name = "googleapis-common-protos" version = "1.71.0" @@ -2829,6 +2842,7 @@ dependencies = [ { name = "ansicolors" }, { name = "click" }, { name = "fastapi" }, + { name = "granite-common" }, { name = "huggingface-hub" }, { name = "jinja2" }, { name = "json5" }, @@ -2923,6 +2937,7 @@ requires-dist = [ { name = "datasets", marker = "extra == 'hf'", specifier = ">=4.0.0" }, { name = "docling", marker = "extra == 'docling'", specifier = ">=2.45.0" }, { name = "fastapi" }, + { name = "granite-common" }, { name = "huggingface-hub", specifier = ">=0.33.4" }, { name = "ibm-watsonx-ai", marker = "extra == 'watsonx'", specifier = ">=1.3.31" }, { name = "jinja2" }, @@ -2934,10 +2949,10 @@ requires-dist = [ { name = "numpy", marker = "extra == 'vllm'", specifier = "<2.0.0" }, { name = "ollama", specifier = ">=0.5.1" }, { name = "openai" }, + { name = "peft", marker = "extra == 'hf'", git = "https://github.com/huggingface/peft.git?rev=293aea5df6db240856a77f89955d1a89ce38b50d" }, { name = "outlines", marker = "extra == 'hf'" }, { name = "outlines-core", marker = "extra == 'hf'", specifier = "==0.1.26" }, { name = "outlines-core", marker = "extra == 'vllm'", specifier = "==0.1.26" }, - { name = "peft", marker = "extra == 'hf'", specifier = ">=0.16.0" }, { name = "pillow" }, { name = "pydantic" }, { name = "requests", specifier = ">=2.32.3" }, @@ -4127,8 +4142,8 @@ wheels = [ [[package]] name = "peft" -version = "0.17.1" -source = { registry = "https://pypi.org/simple" } +version = "0.17.2.dev0" +source = { git = "https://github.com/huggingface/peft.git?rev=293aea5df6db240856a77f89955d1a89ce38b50d#293aea5df6db240856a77f89955d1a89ce38b50d" } dependencies = [ { name = "accelerate" }, { name = "huggingface-hub" }, @@ -4141,10 +4156,6 @@ dependencies = [ { name = "tqdm" }, { name = "transformers" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/70/b8/2e79377efaa1e5f0d70a497db7914ffd355846e760ffa2f7883ab0f600fb/peft-0.17.1.tar.gz", hash = 
"sha256:e6002b42517976c290b3b8bbb9829a33dd5d470676b2dec7cb4df8501b77eb9f", size = 568192, upload-time = "2025-08-21T09:25:22.703Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/49/fe/a2da1627aa9cb6310b6034598363bd26ac301c4a99d21f415b1b2855891e/peft-0.17.1-py3-none-any.whl", hash = "sha256:3d129d64def3d74779c32a080d2567e5f7b674e77d546e3585138216d903f99e", size = 504896, upload-time = "2025-08-21T09:25:18.974Z" }, -] [[package]] name = "pexpect"