31 changes: 26 additions & 5 deletions application/agents/base.py
@@ -42,6 +42,7 @@ def __init__(
llm_handler=None,
tool_executor: Optional[ToolExecutor] = None,
backup_models: Optional[List[str]] = None,
model_user_id: Optional[str] = None,
):
self.endpoint = endpoint
self.llm_name = llm_name
@@ -52,10 +53,13 @@ def __init__(
self.prompt = prompt
self.decoded_token = decoded_token or {}
self.user: str = self.decoded_token.get("sub")
# BYOM-resolution scope: owner for shared agents, caller for
# caller-owned BYOM, None for built-ins. Falls back to self.user
# for worker/legacy callers that don't thread model_user_id.
self.model_user_id = model_user_id
self.tools: List[Dict] = []
self.chat_history: List[Dict] = chat_history if chat_history is not None else []

# Dependency injection for LLM; fall back to creating one if not provided
if llm is not None:
self.llm = llm
else:
@@ -67,8 +71,16 @@ def __init__(
model_id=model_id,
agent_id=agent_id,
backup_models=backup_models,
model_user_id=model_user_id,
)

# For BYOM, the registry id (a UUID) differs from the upstream model
# id (e.g. ``mistral-large-latest``). LLMCreator resolves the upstream
# id onto the LLM instance; cache it here for subsequent gen calls.
self.upstream_model_id = (
getattr(self.llm, "model_id", None) or model_id
)

self.retrieved_docs = retrieved_docs or []

if llm_handler is not None:
@@ -306,7 +318,9 @@ def _check_context_limit(self, messages: List[Dict]) -> bool:
try:
current_tokens = self._calculate_current_context_tokens(messages)
self.current_token_count = current_tokens
context_limit = get_token_limit(self.model_id)
context_limit = get_token_limit(
self.model_id, user_id=self.model_user_id or self.user
)
threshold = int(context_limit * settings.COMPRESSION_THRESHOLD_PERCENTAGE)

if current_tokens >= threshold:
@@ -325,7 +339,9 @@ def _validate_context_size(self, messages: List[Dict]) -> None:

current_tokens = self._calculate_current_context_tokens(messages)
self.current_token_count = current_tokens
context_limit = get_token_limit(self.model_id)
context_limit = get_token_limit(
self.model_id, user_id=self.model_user_id or self.user
)
percentage = (current_tokens / context_limit) * 100

if current_tokens >= context_limit:
@@ -387,7 +403,9 @@ def _build_messages(
)
system_prompt = system_prompt + compression_context

context_limit = get_token_limit(self.model_id)
context_limit = get_token_limit(
self.model_id, user_id=self.model_user_id or self.user
)
system_tokens = num_tokens_from_string(system_prompt)

safety_buffer = int(context_limit * 0.1)
@@ -497,7 +515,10 @@ def _truncate_history_to_fit(
def _llm_gen(self, messages: List[Dict], log_context: Optional[LogContext] = None):
self._validate_context_size(messages)

gen_kwargs = {"model": self.model_id, "messages": messages}
# Use the upstream id resolved by LLMCreator (see __init__).
# Built-in models: same as self.model_id. BYOM: the user's
# typed model name, not the internal UUID.
gen_kwargs = {"model": self.upstream_model_id, "messages": messages}
if self.attachments:
gen_kwargs["_usage_attachments"] = self.attachments

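The self.model_user_id or self.user fallback now appears at each get_token_limit call site above. A minimal sketch of the resolution order that scoping implies (hypothetical stand-ins, not the project's actual get_token_limit; table contents are illustrative):

```python
# Hypothetical sketch of the lookup order the call sites above imply.
# The real get_token_limit is defined elsewhere in the codebase and may differ.
from typing import Dict, Optional, Tuple

BUILTIN_LIMITS: Dict[str, int] = {"docsgpt-local": 8192}  # assumed values
USER_CUSTOM_MODELS: Dict[Tuple[str, str], Dict] = {}  # (user_id, registry_id) -> row
DEFAULT_TOKEN_LIMIT = 8192  # assumed fallback


def get_token_limit(model_id: str, user_id: Optional[str] = None) -> int:
    # A per-user BYOM registration wins: the registry id (a UUID) only
    # resolves under the owning user's scope, which is why the agent
    # passes model_user_id (the owner) before self.user (the caller).
    if user_id is not None:
        row = USER_CUSTOM_MODELS.get((user_id, model_id))
        if row is not None:
            caps = row.get("capabilities", {})
            return int(caps.get("context_window", DEFAULT_TOKEN_LIMIT))
    # Built-in models fall through to the deployment-wide table.
    return BUILTIN_LIMITS.get(model_id, DEFAULT_TOKEN_LIMIT)
```
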
10 changes: 5 additions & 5 deletions application/agents/research_agent.py
@@ -312,7 +312,7 @@ def _clarification_phase(self, question: str) -> Optional[str]:

try:
response = self.llm.gen(
model=self.model_id,
model=self.upstream_model_id,
messages=messages,
tools=None,
response_format={"type": "json_object"},
@@ -390,7 +390,7 @@ def _planning_phase(self, question: str) -> tuple[List[Dict], str]:

try:
response = self.llm.gen(
model=self.model_id,
model=self.upstream_model_id,
messages=messages,
tools=None,
response_format={"type": "json_object"},
@@ -506,7 +506,7 @@ def _research_step_with_executor(

try:
response = self.llm.gen(
model=self.model_id,
model=self.upstream_model_id,
messages=messages,
tools=self.tools if self.tools else None,
)
@@ -537,7 +537,7 @@ def _research_step_with_executor(
)
try:
response = self.llm.gen(
model=self.model_id, messages=messages, tools=None
model=self.upstream_model_id, messages=messages, tools=None
)
self._track_tokens(self._snapshot_llm_tokens())
text = self._extract_text(response)
@@ -664,7 +664,7 @@ def _synthesis_phase(
]

llm_response = self.llm.gen_stream(
model=self.model_id, messages=messages, tools=None
model=self.upstream_model_id, messages=messages, tools=None
)

if log_context:
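Every gen and gen_stream call above swaps self.model_id for self.upstream_model_id, cached in base.py's __init__. A short illustration of why the distinction matters for BYOM (all values below are made up):

```python
# Illustrative values only: a BYOM registration pairs an internal
# registry UUID with the model name the upstream endpoint accepts.
registry_id = "3f2c9b1e-8d4a-4f6e-9c1d-2a7b5e8f0c3d"  # stored by DocsGPT
upstream_id = "mistral-large-latest"                   # known to the provider

# What actually goes over the wire to the OpenAI-compatible upstream.
payload = {
    "model": upstream_id,  # the provider validates this field server-side
    "messages": [{"role": "user", "content": "hello"}],
}
# Sending the registry UUID instead would be rejected by the upstream,
# since it is meaningful only inside DocsGPT's user_custom_models table.
```
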
3 changes: 3 additions & 0 deletions application/agents/tools/internal_search.py
@@ -39,6 +39,7 @@ def _get_retriever(self):
chunks=int(self.config.get("chunks", 2)),
doc_token_limit=int(self.config.get("doc_token_limit", 50000)),
model_id=self.config.get("model_id", "docsgpt-local"),
model_user_id=self.config.get("model_user_id"),
user_api_key=self.config.get("user_api_key"),
agent_id=self.config.get("agent_id"),
llm_name=self.config.get("llm_name", settings.LLM_PROVIDER),
@@ -435,6 +436,7 @@ def build_internal_tool_config(
chunks: int = 2,
doc_token_limit: int = 50000,
model_id: str = "docsgpt-local",
model_user_id: Optional[str] = None,
user_api_key: Optional[str] = None,
agent_id: Optional[str] = None,
llm_name: str = None,
@@ -449,6 +451,7 @@
"chunks": chunks,
"doc_token_limit": doc_token_limit,
"model_id": model_id,
"model_user_id": model_user_id,
"user_api_key": user_api_key,
"agent_id": agent_id,
"llm_name": llm_name or settings.LLM_PROVIDER,
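For reference, the shape of the dict the builder returns with the new field in place, as assembled in the diff above (ids and values are placeholders):

```python
# Sketch of the config dict build_internal_tool_config produces; the
# fields are taken from the diff above, the values are illustrative.
config = {
    "chunks": 2,
    "doc_token_limit": 50000,
    "model_id": "3f2c9b1e-8d4a-4f6e-9c1d-2a7b5e8f0c3d",  # BYOM registry UUID
    "model_user_id": "auth0|owner-123",  # owner scope, so the UUID resolves
    "user_api_key": None,
    "agent_id": "agent-42",
    "llm_name": "openai",  # falls back to settings.LLM_PROVIDER when unset
}
```
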
16 changes: 14 additions & 2 deletions application/agents/workflows/workflow_engine.py
@@ -211,15 +211,26 @@ def _execute_agent_node(
node_config.json_schema, node.title
)
node_model_id = node_config.model_id or self.agent.model_id
# Inherit BYOM scope from parent agent so owner-stored BYOM
# resolves on shared workflows.
node_user_id = getattr(self.agent, "model_user_id", None) or (
self.agent.decoded_token.get("sub")
if isinstance(self.agent.decoded_token, dict)
else None
)
node_llm_name = (
node_config.llm_name
or get_provider_from_model_id(node_model_id or "")
or get_provider_from_model_id(
node_model_id or "", user_id=node_user_id
)
or self.agent.llm_name
)
node_api_key = get_api_key_for_provider(node_llm_name) or self.agent.api_key

if node_json_schema and node_model_id:
model_capabilities = get_model_capabilities(node_model_id)
model_capabilities = get_model_capabilities(
node_model_id, user_id=node_user_id
)
if model_capabilities and not model_capabilities.get(
"supports_structured_output", False
):
@@ -232,6 +243,7 @@ def _execute_agent_node(
"endpoint": self.agent.endpoint,
"llm_name": node_llm_name,
"model_id": node_model_id,
"model_user_id": getattr(self.agent, "model_user_id", None),
"api_key": node_api_key,
"tool_ids": node_config.tools,
"prompt": node_config.system_prompt,
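The node_user_id chain above condenses to a two-step fallback. A standalone restatement under an illustrative helper name, not part of the PR:

```python
from typing import Optional


def resolve_node_scope(agent) -> Optional[str]:
    # Prefer the BYOM scope already resolved on the parent agent, so a
    # shared workflow keeps running under the owner's registrations.
    scope = getattr(agent, "model_user_id", None)
    if scope:
        return scope
    # Otherwise fall back to the token subject (the caller).
    token = getattr(agent, "decoded_token", None)
    return token.get("sub") if isinstance(token, dict) else None
```
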
65 changes: 65 additions & 0 deletions application/alembic/versions/0003_user_custom_models.py
@@ -0,0 +1,65 @@
"""0003 user_custom_models — per-user OpenAI-compatible model registrations.

Revision ID: 0003_user_custom_models
Revises: 0002_app_metadata
"""

from typing import Sequence, Union

from alembic import op


revision: str = "0003_user_custom_models"
down_revision: Union[str, None] = "0002_app_metadata"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
op.execute(
"""
CREATE TABLE user_custom_models (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
user_id TEXT NOT NULL,
upstream_model_id TEXT NOT NULL,
display_name TEXT NOT NULL,
description TEXT NOT NULL DEFAULT '',
base_url TEXT NOT NULL,
api_key_encrypted TEXT NOT NULL,
capabilities JSONB NOT NULL DEFAULT '{}'::jsonb,
enabled BOOLEAN NOT NULL DEFAULT true,
created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT now()
);
"""
)
op.execute(
"CREATE INDEX user_custom_models_user_id_idx "
"ON user_custom_models (user_id);"
)

# Mirror the project-wide invariants set up in 0001_initial:
# * user_id FK with ON DELETE RESTRICT (deferrable),
# * ensure_user_exists() trigger so the parent users row is auto-created,
# * set_updated_at() trigger.
op.execute(
"ALTER TABLE user_custom_models "
"ADD CONSTRAINT user_custom_models_user_id_fk "
"FOREIGN KEY (user_id) REFERENCES users(user_id) "
"ON DELETE RESTRICT DEFERRABLE INITIALLY IMMEDIATE;"
)
op.execute(
"CREATE TRIGGER user_custom_models_ensure_user "
"BEFORE INSERT OR UPDATE OF user_id ON user_custom_models "
"FOR EACH ROW EXECUTE FUNCTION ensure_user_exists();"
)
op.execute(
"CREATE TRIGGER user_custom_models_set_updated_at "
"BEFORE UPDATE ON user_custom_models "
"FOR EACH ROW WHEN (OLD.* IS DISTINCT FROM NEW.*) "
"EXECUTE FUNCTION set_updated_at();"
)


def downgrade() -> None:
op.execute("DROP TABLE IF EXISTS user_custom_models;")
52 changes: 48 additions & 4 deletions application/api/answer/routes/base.py
@@ -177,6 +177,7 @@ def complete_stream(
is_shared_usage: bool = False,
shared_token: Optional[str] = None,
model_id: Optional[str] = None,
model_user_id: Optional[str] = None,
_continuation: Optional[Dict] = None,
) -> Generator[str, None, None]:
"""
@@ -289,8 +290,18 @@ def complete_stream(
# conversation if this is the first turn.
if not conversation_id and should_save_conversation:
try:
# Use model-owner scope so shared-agent
# owner-BYOM resolves to its registered plugin.
provider = (
get_provider_from_model_id(model_id)
get_provider_from_model_id(
model_id,
user_id=model_user_id
or (
decoded_token.get("sub")
if decoded_token
else None
),
)
if model_id
else settings.LLM_PROVIDER
)
@@ -304,6 +315,7 @@ def complete_stream(
decoded_token=decoded_token,
model_id=model_id,
agent_id=agent_id,
model_user_id=model_user_id,
)
conversation_id = (
self.conversation_service.save_conversation(
@@ -340,6 +352,9 @@ def complete_stream(
tool_schemas=getattr(agent, "tools", []),
agent_config={
"model_id": model_id or self.default_model_id,
# Persist BYOM scope so a resumed stream
# doesn't fall back to the caller's scope.
"model_user_id": model_user_id,
"llm_name": getattr(agent, "llm_name", settings.LLM_PROVIDER),
"api_key": getattr(agent, "api_key", None),
"user_api_key": user_api_key,
@@ -370,8 +385,14 @@ def complete_stream(
if isNoneDoc:
for doc in source_log_docs:
doc["source"] = "None"
# Run under model-owner scope so the title-gen LLM inside
# save_conversation uses the owner's BYOM provider/key.
provider = (
get_provider_from_model_id(model_id)
get_provider_from_model_id(
model_id,
user_id=model_user_id
or (decoded_token.get("sub") if decoded_token else None),
)
if model_id
else settings.LLM_PROVIDER
)
@@ -384,6 +405,7 @@ def complete_stream(
decoded_token=decoded_token,
model_id=model_id,
agent_id=agent_id,
model_user_id=model_user_id,
)

if should_save_conversation:
@@ -481,12 +503,34 @@ def complete_stream(
if isNoneDoc:
for doc in source_log_docs:
doc["source"] = "None"
# Mirror the normal-path provider resolution so the
# partial-save title LLM uses the model-owner's BYOM
# registration (shared-agent dispatch) rather than
# the deployment default with the instance api key.
provider = (
get_provider_from_model_id(
model_id,
user_id=model_user_id
or (
decoded_token.get("sub")
if decoded_token
else None
),
)
if model_id
else settings.LLM_PROVIDER
)
sys_api_key = get_api_key_for_provider(
provider or settings.LLM_PROVIDER
)
llm = LLMCreator.create_llm(
settings.LLM_PROVIDER,
api_key=settings.API_KEY,
provider or settings.LLM_PROVIDER,
api_key=sys_api_key,
user_api_key=user_api_key,
decoded_token=decoded_token,
model_id=model_id,
agent_id=agent_id,
model_user_id=model_user_id,
)
self.conversation_service.save_conversation(
conversation_id,
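The owner-first scope expression is now repeated at three call sites in this file. A minimal helper capturing the pattern (hypothetical; the PR leaves the expression inline):

```python
from typing import Dict, Optional


def resolve_model_scope(
    model_user_id: Optional[str], decoded_token: Optional[Dict]
) -> Optional[str]:
    # Owner scope wins: shared agents must resolve against the model
    # owner's BYOM registrations. Fall back to the caller's JWT subject,
    # and finally to None so built-in resolution applies.
    if model_user_id:
        return model_user_id
    if decoded_token:
        return decoded_token.get("sub")
    return None
```
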
2 changes: 2 additions & 0 deletions application/api/answer/routes/stream.py
@@ -109,6 +109,7 @@ def post(self):
decoded_token=processor.decoded_token,
agent_id=processor.agent_id,
model_id=processor.model_id,
model_user_id=processor.model_user_id,
_continuation={
"messages": messages,
"tools_dict": tools_dict,
@@ -145,6 +146,7 @@ def post(self):
is_shared_usage=processor.is_shared_usage,
shared_token=processor.shared_token,
model_id=processor.model_id,
model_user_id=processor.model_user_id,
),
mimetype="text/event-stream",
)