diff --git a/src/lmstudio/json_api.py b/src/lmstudio/json_api.py
index ab83b04..d57037c 100644
--- a/src/lmstudio/json_api.py
+++ b/src/lmstudio/json_api.py
@@ -123,6 +123,7 @@
 # explicitly via `lmstudio.json_api`, it isn't exported
 # implicitly as part of the top-level `lmstudio` API.
 __all__ = [
+    "ActResult",
     "AnyModelSpecifier",
     "EmbeddingModelInfo",
     "EmbeddingModelInstanceInfo",
@@ -151,7 +152,6 @@
     "ModelSpecifierDict",
     "ModelQuery",
     "ModelQueryDict",
-    "OperationResult",
     "PredictionResult",
     "PredictionRoundResult",
     "SerializedLMSExtendedError",
@@ -455,9 +455,9 @@ def _to_history_content(self) -> str:

 @dataclass(kw_only=True, frozen=True, slots=True)
 class PredictionRoundResult(PredictionResult[str]):
-    """The result of a prediction within a multi-round tool using operation."""
+    """The result of a prediction within a multi-round tool using action."""

-    round_index: int  # The round within the operation that produced this result
+    round_index: int  # The round within the action that produced this result

     @classmethod
     def from_result(cls, result: PredictionResult[str], round_index: int) -> Self:
@@ -471,10 +471,10 @@ def from_result(cls, result: PredictionResult[str], round_index: int) -> Self:

 @dataclass(kw_only=True, frozen=True, slots=True)
-class OperationResult:
-    """Summary of a completed multi-round tool using operation."""
+class ActResult:
+    """Summary of a completed multi-round tool using action."""

-    # Actual operation output is reported via callbacks
+    # Detailed action results are reported via callbacks (for now)
     # fmt: off
     rounds: int
@@ -1073,7 +1073,7 @@ def __init__(
         on_first_token: Callable[[], None] | None = None,
         on_prediction_fragment: Callable[[LlmPredictionFragment], None] | None = None,
         on_prompt_processing_progress: Callable[[float], None] | None = None,
-        # The remaining options are only relevant for multi-round tool operations
+        # The remaining options are only relevant for multi-round tool actions
         handle_invalid_tool_request: Callable[
             [LMStudioPredictionError, _ToolCallRequest | None], str
         ]
@@ -1359,7 +1359,7 @@ def parse_tools(
         """Split tool function definitions into server and client details."""
         if not tools:
             raise LMStudioValueError(
-                "Tool operation requires at least one tool to be defined."
+                "Tool using actions require at least one tool to be defined."
             )
         llm_tool_defs: list[LlmTool] = []
         client_tool_map: dict[str, ClientToolSpec] = {}
diff --git a/src/lmstudio/sync_api.py b/src/lmstudio/sync_api.py
index 1b10cf9..ddbd899 100644
--- a/src/lmstudio/sync_api.py
+++ b/src/lmstudio/sync_api.py
@@ -53,6 +53,7 @@
     _ToolCallRequest,
 )
 from .json_api import (
+    ActResult,
     AnyModelSpecifier,
     AvailableModelBase,
     ChannelEndpoint,
@@ -85,7 +86,6 @@
     ModelSessionTypes,
     ModelTypesEmbedding,
     ModelTypesLlm,
-    OperationResult,
     PredictionEndpoint,
     PredictionFragmentEvent,
     PredictionResult,
@@ -1539,7 +1539,7 @@ def respond(
     # Multi-round predictions are currently a sync-only handle-only feature
     # TODO: Refactor to allow for more code sharing with the async API
     @sdk_public_api()
-    def operate(
+    def act(
         self,
         chat: Chat | ChatHistoryDataDict | str,
         tools: Iterable[ToolFunctionDef | ToolFunctionDefDict],
@@ -1559,14 +1559,14 @@
             [LMStudioPredictionError, _ToolCallRequest | None], str
         ]
         | None = None,
-    ) -> OperationResult:
-        """Request a response (with implicit tool use) in an ongoing assistant chat session."""
-        operation_start_time = time.perf_counter()
+    ) -> ActResult:
+        """Request a response (with implicit tool use) in an ongoing agent chat session."""
+        start_time = time.perf_counter()
         # It is not yet possible to combine tool calling with requests for structured responses
         response_format = None
         if isinstance(chat, Chat):
             chat._fetch_file_handles(self._session._fetch_file_handle)
-        op_chat: Chat = Chat.from_history(chat)
+        agent_chat: Chat = Chat.from_history(chat)
         del chat
         # Multiple rounds, until all tool calls are resolved or limit is reached
         round_counter: Iterable[int]
@@ -1622,9 +1622,11 @@ def _wrapped_on_prompt_processing_progress(progress: float) -> None:
             # Update the endpoint definition on each iteration in order to:
             # * update the chat history with the previous round result
             # * be able to disallow tool use when the rounds are limited
+            # TODO: Refactor endpoint API to avoid repeatedly performing the
+            # LlmPredictionConfig -> KvConfigStack transformation
             endpoint = ChatResponseEndpoint(
                 self.identifier,
-                op_chat,
+                agent_chat,
                 response_format,
                 config,
                 None,  # Multiple messages are generated per round
@@ -1658,23 +1660,29 @@ def _wrapped_on_prompt_processing_progress(progress: float) -> None:
                     tool_results = [
                         fut.result() for fut in as_completed(pending_tool_calls)
                     ]
-                    requests_message = op_chat._add_assistant_tool_requests(
+                    requests_message = agent_chat._add_assistant_tool_requests(
                         prediction, tool_call_requests
                     )
-                    results_message = op_chat._add_tool_results(tool_results)
+                    results_message = agent_chat._add_tool_results(tool_results)
                     if on_message is not None:
                         on_message(requests_message)
                         on_message(results_message)
             elif on_message is not None:
-                on_message(op_chat.add_assistant_response(prediction))
+                on_message(agent_chat.add_assistant_response(prediction))
             if on_round_end is not None:
                 on_round_end(round_index)
             if not tool_call_requests:
                 # No tool call requests -> we're done here
                 break
+            if round_index == final_round_index:
+                # We somehow received at least one tool call request,
+                # even though tools are omitted on the final round
+                err_msg = "Model requested tool use on final prediction round."
+                endpoint._handle_invalid_tool_request(err_msg)
+                break
         num_rounds = round_index + 1
-        duration = time.perf_counter() - operation_start_time
-        return OperationResult(rounds=num_rounds, total_time_seconds=duration)
+        duration = time.perf_counter() - start_time
+        return ActResult(rounds=num_rounds, total_time_seconds=duration)

     @sdk_public_api()
     def apply_prompt_template(
diff --git a/tests/test_inference.py b/tests/test_inference.py
index 35e6d2c..445e7b5 100644
--- a/tests/test_inference.py
+++ b/tests/test_inference.py
@@ -162,7 +162,7 @@ def test_duplicate_tool_names_rejected() -> None:


 @pytest.mark.lmstudio
-def test_tool_operation(caplog: LogCap) -> None:
+def test_tool_using_agent(caplog: LogCap) -> None:
     # This is currently a sync-only API (it will be refactored after 1.0.0)
     caplog.set_level(logging.DEBUG)

@@ -177,9 +177,9 @@ def test_tool_operation(caplog: LogCap) -> None:

     # Ensure ignoring the round index passes static type checks
     predictions: list[PredictionResult[str]] = []
-    op_result = llm.operate(chat, tools, on_prediction_completed=predictions.append)
+    act_result = llm.act(chat, tools, on_prediction_completed=predictions.append)
     assert len(predictions) > 1
-    assert op_result.rounds == len(predictions)
+    assert act_result.rounds == len(predictions)
     assert "220" in predictions[-1].content

     for _logger_name, log_level, message in caplog.record_tuples:
@@ -194,7 +194,7 @@


 @pytest.mark.lmstudio
-def test_tool_operation_callbacks(caplog: LogCap) -> None:
+def test_tool_using_agent_callbacks(caplog: LogCap) -> None:
     # This is currently a sync-only API (it will be refactored after 1.0.0)
     caplog.set_level(logging.DEBUG)

@@ -222,7 +222,7 @@ def _append_fragment(f: LlmPredictionFragment, round_index: int) -> None:

     # TODO: Also check on_prompt_processing_progress and handling invalid messages
     # (although it isn't clear how to provoke calls to the latter without mocking)
-    op_result = llm.operate(
+    act_result = llm.act(
         chat,
         tools,
         on_first_token=first_tokens.append,
@@ -232,7 +232,7 @@ def _append_fragment(f: LlmPredictionFragment, round_index: int) -> None:
         on_round_end=round_ends.append,
         on_prediction_completed=predictions.append,
     )
-    num_rounds = op_result.rounds
+    num_rounds = act_result.rounds
     sequential_round_indices = list(range(num_rounds))
     assert num_rounds > 1
     assert [p.round_index for p in predictions] == sequential_round_indices
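For context, the diff above renames the multi-round tool calling entry point from LLM.operate() returning OperationResult to LLM.act() returning ActResult. Below is a minimal usage sketch of the renamed sync API; the model key and the tool are placeholders, and the lms.llm()/lms.Chat() conveniences plus the ToolFunctionDef field names are assumptions drawn from the SDK's published docs rather than from this diff.

    import lmstudio as lms
    from lmstudio import ToolFunctionDef

    def add(a: int, b: int) -> int:
        # Plain Python implementation the client runs when the model calls the "add" tool
        return a + b

    # Matches the Iterable[ToolFunctionDef | ToolFunctionDefDict] parameter accepted by act()
    add_tool = ToolFunctionDef(
        name="add",
        description="Add two integers and return the sum.",
        parameters={"a": int, "b": int},
        implementation=add,
    )

    chat = lms.Chat("You are a calculator assistant.")
    chat.add_user_message("What is 104 plus 116?")

    llm = lms.llm("placeholder-model-key")  # any loaded tool-capable model
    result = llm.act(chat, [add_tool], on_message=chat.append)
    # ActResult only summarises the run; per-round output arrives via the callbacks
    print(result.rounds, result.total_time_seconds)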