biocypher
diff --git a/‎biochatter/llm_connect/__init__.py‎
Lines changed: 2 additions & 0 deletions b/‎biochatter/llm_connect/__init__.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎biochatter/llm_connect/anthropic.py‎
Lines changed: 7 additions & 1 deletion b/‎biochatter/llm_connect/anthropic.py‎
Lines changed: 7 additions & 1 deletion
diff --git a/‎biochatter/llm_connect/conversation.py‎
Lines changed: 202 additions & 9 deletions b/‎biochatter/llm_connect/conversation.py‎
Lines changed: 202 additions & 9 deletions
diff --git a/‎biochatter/llm_connect/gemini.py‎
Lines changed: 9 additions & 2 deletions b/‎biochatter/llm_connect/gemini.py‎
Lines changed: 9 additions & 2 deletions
@@ -9,6 +9,7 @@
 from biochatter.llm_connect.misc import BloomConversation, WasmConversation
 from biochatter.llm_connect.ollama import OllamaConversation
 from biochatter.llm_connect.openai import GptConversation
+from biochatter.llm_connect.openrouter import OpenRouterConversation
 from biochatter.llm_connect.xinference import XinferenceConversation
 
 __all__ = [
@@ -21,6 +22,7 @@
     "LangChainConversation",
     "LiteLLMConversation",
     "OllamaConversation",
+    "OpenRouterConversation",
     "WasmConversation",
     "XinferenceConversation",
 ]
@@ -1,3 +1,5 @@
+import warnings
+
 import anthropic
 from langchain_anthropic import ChatAnthropic
 from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
@@ -104,6 +106,9 @@ def _primary_query(self, **kwargs) -> tuple:
                 the token usage.
 
         """
+        if kwargs:
+            warnings.warn(f"Warning: {kwargs} are not used by this class", UserWarning)
+
         try:
             history = self._create_history()
             response = self.chat.generate([history])
@@ -126,7 +131,8 @@ def _primary_query(self, **kwargs) -> tuple:
             return str(e), None
 
         msg = response.generations[0][0].text
-        token_usage = response.llm_output.get("token_usage")
+        token_usage_raw = response.llm_output.get("token_usage")
+        token_usage = self._extract_total_tokens(token_usage_raw)
 
         self.append_ai_message(msg)
 
 
@@ -110,6 +110,7 @@ def __init__(
         tool_call_mode: Literal["auto", "text"] = "auto",
         mcp: bool = False,
         additional_tools_instructions: str = None,
+        force_tool: bool = False,
     ) -> None:
         super().__init__()
         self.model_name = model_name
@@ -130,6 +131,7 @@ def __init__(
         self.tools_prompt = None
         self.mcp = mcp
         self.additional_tools_instructions = additional_tools_instructions if additional_tools_instructions else ""
+        self.force_tool = force_tool
 
     @property
     def chat(self):
@@ -194,6 +196,188 @@ def find_rag_agent(self, mode: str) -> tuple[int, RagAgent]:
                 return i, val
         return -1, None
 
+    def _extract_total_tokens(self, token_usage: dict | int | None) -> int | None:
+        """Extract the total token count from a provider-specific usage record.
+
+        Token usage arrives in different shapes depending on the provider:
+        - OpenAI/Azure: {"prompt_tokens": X, "completion_tokens": Y, "total_tokens": Z}
+        - Anthropic: {"input_tokens": X, "output_tokens": Y} -> total is computed
+        - Gemini: {"total_tokens": Z}
+        - Ollama: integer (eval_count) -> returned as is
+        - LiteLLM: {"input_tokens": X, "output_tokens": Y, "total_tokens": Z}
+
+        Dictionary entries whose value is ``None`` are treated as missing, so a
+        partially filled record falls through to the next best source instead
+        of raising ``TypeError`` during the addition.
+
+        Args:
+        ----
+            token_usage: Token usage in various formats (dict, int, or None)
+
+        Returns:
+        -------
+            int | None: Total token count, or None if not available
+
+        """
+        # Integer token counts (Ollama, some others) already are the total.
+        if isinstance(token_usage, int):
+            return token_usage
+
+        # None and any other non-dict value carry no usable information.
+        if not isinstance(token_usage, dict):
+            return None
+
+        # Prefer the total reported by the provider itself.
+        reported_total = token_usage.get("total_tokens")
+        if reported_total is not None:
+            return reported_total
+
+        # Otherwise sum a complete pair: Anthropic-style keys first, then the
+        # OpenAI-style keys as a fallback.
+        key_pairs = (
+            ("input_tokens", "output_tokens"),
+            ("prompt_tokens", "completion_tokens"),
+        )
+        for first_key, second_key in key_pairs:
+            first = token_usage.get(first_key)
+            second = token_usage.get(second_key)
+            if first is not None and second is not None:
+                return first + second
+
+        # If only one type of token count is available, use it.
+        single_keys = (
+            "input_tokens",
+            "output_tokens",
+            "prompt_tokens",
+            "completion_tokens",
+        )
+        for key in single_keys:
+            single = token_usage.get(key)
+            if single is not None:
+                return single
+
+        # No meaningful token count could be extracted.
+        return None
+
+    def _extract_input_tokens(self, token_usage: dict | int | None) -> int | None:
+        """Return the input (prompt) token count of a usage record, if any.
+
+        Only dictionaries can be split into input and output counts. The keys
+        are looked up in priority order:
+        - "input_tokens" (Anthropic, LiteLLM style)
+        - "prompt_tokens" (OpenAI style)
+
+        Args:
+        ----
+            token_usage: Token usage in various formats (dict, int, or None)
+
+        Returns:
+        -------
+            int | None: Input token count, or None if not available
+
+        """
+        # None carries nothing, and a bare integer cannot be split into input
+        # and output parts; any other non-dict value is unsupported as well.
+        if not isinstance(token_usage, dict):
+            return None
+
+        # First matching key wins.
+        for candidate_key in ("input_tokens", "prompt_tokens"):
+            if candidate_key in token_usage:
+                return token_usage[candidate_key]
+
+        # The record has no recognised input token key.
+        return None
+
+    def _extract_output_tokens(self, token_usage: dict | int | None) -> int | None:
+        """Return the output (completion) token count of a usage record, if any.
+
+        Only dictionaries can be split into input and output counts. The keys
+        are looked up in priority order:
+        - "output_tokens" (Anthropic, LiteLLM style)
+        - "completion_tokens" (OpenAI style)
+        - "candidates_tokens" (Gemini style)
+
+        Args:
+        ----
+            token_usage: Token usage in various formats (dict, int, or None)
+
+        Returns:
+        -------
+            int | None: Output token count, or None if not available
+
+        """
+        # None carries nothing, and a bare integer cannot be split into input
+        # and output parts; any other non-dict value is unsupported as well.
+        if not isinstance(token_usage, dict):
+            return None
+
+        # First matching key wins.
+        for candidate_key in ("output_tokens", "completion_tokens", "candidates_tokens"):
+            if candidate_key in token_usage:
+                return token_usage[candidate_key]
+
+        # The record has no recognised output token key.
+        return None
+
+    def compute_cumulative_token_usage(self) -> dict:
+        """Compute running token totals over the AI messages of the conversation.
+
+        Walks ``self.messages`` in order and, for every ``AIMessage``, reads its
+        ``usage_metadata`` attribute (``None`` when absent) and passes it to the
+        three ``_extract_*_tokens`` helpers. A count the helpers report as
+        ``None`` leaves the corresponding running total unchanged.
+
+        Returns
+        -------
+            dict: Token usage information with lists of running totals, one
+                entry per AI message:
+                - "total_tokens": list[int] - running total at each message
+                - "input_tokens": list[int] - running input total at each message
+                - "output_tokens": list[int] - running output total at each message
+
+        """
+        token_types = ("total_tokens", "input_tokens", "output_tokens")
+
+        # Current sum and the history of sums, per token type.
+        running = dict.fromkeys(token_types, 0)
+        cumulative = {token_type: [] for token_type in token_types}
+
+        for message in self.messages:
+            # Only AI messages contribute an entry.
+            if not isinstance(message, AIMessage):
+                continue
+
+            usage = getattr(message, "usage_metadata", None)
+            counts = {
+                "total_tokens": self._extract_total_tokens(usage),
+                "input_tokens": self._extract_input_tokens(usage),
+                "output_tokens": self._extract_output_tokens(usage),
+            }
+
+            for token_type, count in counts.items():
+                # A missing count still yields an entry: the previous total.
+                if count is not None:
+                    running[token_type] += count
+                cumulative[token_type].append(running[token_type])
+
+        return cumulative
+
     @abstractmethod
     def set_api_key(self, api_key: str, user: str | None = None) -> None:
         """Set the API key."""
@@ -253,19 +437,24 @@ def bind_tools(self, tools: list[Callable]) -> None:
         # If not, fail gracefully
         # raise ValueError(f"Model {self.model_name} does not support tool calling.")
 
-    def append_ai_message(self, message: str) -> None:
+    def append_ai_message(self, message: str | AIMessage) -> None:
         """Add a message from the AI to the conversation.
 
         Args:
         ----
             message (str): The message from the AI.
+                An ``AIMessage`` instance is also accepted and is appended
+                as is; a string is wrapped in a new ``AIMessage``.
+
+        Raises:
+        ------
+            ValueError: If ``message`` is neither a string nor an ``AIMessage``.
 
         """
-        self.messages.append(
-            AIMessage(
-                content=message,
-            ),
-        )
+        if isinstance(message, AIMessage):
+            self.messages.append(message)
+        elif isinstance(message, str):
+            self.messages.append(
+                AIMessage(
+                    content=message,
+                ),
+            )
+        else:
+            raise ValueError(f"Invalid message type: {type(message)}")
 
     def append_system_message(self, message: str) -> None:
         """Add a system message to the conversation.
@@ -473,9 +662,13 @@ def query(
             track_tool_calls=track_tool_calls,
         )
 
+        # case of structured output
+        if (token_usage == -1) and structured_model:
+            return (msg, 0, None)
+
         if not token_usage:
             # indicates error
-            return (msg, token_usage, None)
+            return (msg, None, None)
 
         if not self.correct:
             return (msg, token_usage, None)
@@ -712,7 +905,7 @@ def _process_tool_calls(
                             additional_instructions=self.additional_instructions_tool_interpretation,
                         )
                     )
-                    self.append_ai_message(tool_result_interpretation.content)
+                    self.messages.append(tool_result_interpretation)
                     msg += f"\nTool results interpretation: {tool_result_interpretation.content}"
                 else:
                     # Single tool: explain individual result (maintain current behavior)
@@ -725,7 +918,7 @@ def _process_tool_calls(
                             additional_instructions=self.additional_instructions_tool_interpretation,
                         )
                     )
-                    self.append_ai_message(tool_result_interpretation.content)
+                    self.messages.append(tool_result_interpretation)
                     msg += f"\nTool result interpretation: {tool_result_interpretation.content}"
 
             return msg
 
@@ -1,3 +1,4 @@
+import warnings
 from collections.abc import Callable
 from typing import Literal
 
@@ -119,6 +120,10 @@ def _primary_query(self, tools: list[Callable] | None = None, **kwargs) -> tuple
                 the token usage.
 
         """
+        if kwargs:
+            kwargs.pop("tools", None)
+            warnings.warn(f"Warning: {kwargs} are not used by this class", UserWarning)
+
         # bind tools to the chat if provided in the query
         chat = self.chat.bind_tools(tools) if (tools and self.model_name in TOOL_CALLING_MODELS) else self.chat
 
@@ -134,7 +139,8 @@ def _primary_query(self, tools: list[Callable] | None = None, **kwargs) -> tuple
             msg = response.content
             self.append_ai_message(msg)
 
-        token_usage = response.usage_metadata["total_tokens"]
+        token_usage_raw = response.usage_metadata
+        token_usage = self._extract_total_tokens(token_usage_raw)
 
         return msg, token_usage
 
@@ -171,6 +177,7 @@ def _correct_response(self, msg: str) -> str:
         response = self.ca_chat.invoke(ca_messages)
 
         correction = response.content
-        token_usage = response.usage_metadata["total_tokens"]
+        token_usage_raw = response.usage_metadata
+        token_usage = self._extract_total_tokens(token_usage_raw)
 
         return correction