Skip to content

Commit 3086b24

Browse files
authored
Merge pull request #23 from nuhatech/dev
Release v0.1.22: Added `use_max_completion_tokens` parameter to `OpenAILLM`
2 parents 8c1c1fd + 7c02ca7 commit 3086b24

File tree

5 files changed

+47
-14
lines changed

5 files changed

+47
-14
lines changed

CHANGELOG.md

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
## [Unreleased]
99

10+
## [0.1.22] - 2026-02-11
11+
12+
### Added
13+
- **`use_max_completion_tokens` parameter for `OpenAILLM`**: Newer OpenAI models (o1, o3, gpt-5-nano, etc.) require `max_completion_tokens` instead of the deprecated `max_tokens`. Set `use_max_completion_tokens=True` to use the new parameter name. When `max_tokens` is `None`, the parameter is now omitted entirely instead of sending `null` (which some models reject).
14+
- **`use_max_completion_tokens` parameter for `AgenticQueryPipeline`**: Propagated to the internally created `OpenAILLM` when no custom LLM is provided.
15+
1016
## [0.1.21] - 2026-02-11
1117

1218
### Fixed
@@ -264,7 +270,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
264270
- Example scripts for common use cases
265271
- API reference documentation
266272

267-
[Unreleased]: https://github.com/nuhatech/maktaba/compare/v0.1.21...HEAD
273+
[Unreleased]: https://github.com/nuhatech/maktaba/compare/v0.1.22...HEAD
274+
[0.1.22]: https://github.com/nuhatech/maktaba/compare/v0.1.21...v0.1.22
268275
[0.1.21]: https://github.com/nuhatech/maktaba/compare/v0.1.20...v0.1.21
269276
[0.1.20]: https://github.com/nuhatech/maktaba/compare/v0.1.19...v0.1.20
270277
[0.1.19]: https://github.com/nuhatech/maktaba/compare/v0.1.18...v0.1.19

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "maktaba"
3-
version = "0.1.21"
3+
version = "0.1.22"
44
description = "Production-ready RAG infrastructure for multilingual applications"
55
authors = [
66
{name = "NuhaTech", email = "contact@nuhatech.com"}

src/maktaba/llm/openai.py

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ def __init__(
3636
temperature: float = 0.0,
3737
timeout_s: float = 30.0,
3838
prompts: Optional[AgenticPrompts] = None,
39+
use_max_completion_tokens: bool = False,
3940
) -> None:
4041
"""
4142
Initialize OpenAI LLM.
@@ -46,12 +47,16 @@ def __init__(
4647
temperature: Sampling temperature (default: 0 for deterministic)
4748
timeout_s: Request timeout in seconds
4849
prompts: Custom prompts for agentic operations (defaults to default_prompts())
50+
use_max_completion_tokens: Use ``max_completion_tokens`` instead of
51+
``max_tokens`` in API calls. Required for newer OpenAI models
52+
(o1, o3, gpt-5-nano, etc.) that no longer accept ``max_tokens``.
4953
"""
5054
self.api_key = api_key
5155
self.model = model
5256
self.temperature = temperature
5357
self.timeout_s = timeout_s
5458
self.prompts = prompts or default_prompts()
59+
self.use_max_completion_tokens = use_max_completion_tokens
5560
self._logger = get_logger("maktaba.llm.openai")
5661

5762
# Lazy client initialization
@@ -70,6 +75,18 @@ def _get_client(self) -> Optional[Any]:
7075
self._client = self._OpenAI(api_key=self.api_key, timeout=self.timeout_s)
7176
return self._client
7277

78+
def _token_limit_kwargs(self, max_tokens: int | None) -> Dict[str, Any]:
79+
"""Build the token-limit keyword argument for the OpenAI API.
80+
81+
Returns an empty dict when *max_tokens* is ``None`` so the parameter
82+
is omitted entirely (some models reject ``null``). When a value is
83+
provided the key name depends on :attr:`use_max_completion_tokens`.
84+
"""
85+
if max_tokens is None:
86+
return {}
87+
key = "max_completion_tokens" if self.use_max_completion_tokens else "max_tokens"
88+
return {key: max_tokens}
89+
7390
def _format_chat_history(self, messages: List[Tuple[str, str]]) -> str:
7491
"""Format chat history as text."""
7592
lines = []
@@ -102,7 +119,7 @@ async def complete_text(
102119
{"role": "user", "content": prompt},
103120
],
104121
temperature=temperature,
105-
max_tokens=max_tokens,
122+
**self._token_limit_kwargs(max_tokens),
106123
)
107124

108125
usage = LLMUsage(
@@ -141,7 +158,7 @@ async def complete_json(
141158
{"role": "user", "content": prompt},
142159
],
143160
temperature=temperature,
144-
max_tokens=max_tokens,
161+
**self._token_limit_kwargs(max_tokens),
145162
response_format={"type": "json_object"},
146163
)
147164

@@ -182,7 +199,7 @@ async def stream_text(
182199
{"role": "user", "content": prompt},
183200
],
184201
temperature=temperature,
185-
max_tokens=max_tokens,
202+
**self._token_limit_kwargs(max_tokens),
186203
stream=True,
187204
)
188205

src/maktaba/pipeline/agentic.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ def __init__(
4343
llm_model: str = "gpt-4o-mini",
4444
prompts: Optional[AgenticPrompts] = None,
4545
namespace: Optional[str] = None,
46+
use_max_completion_tokens: bool = False,
4647
) -> None:
4748
"""
4849
Initialize agentic pipeline.
@@ -57,6 +58,9 @@ def __init__(
5758
llm_model: LLM model name
5859
prompts: Custom prompts for LLM operations (defaults to default_prompts())
5960
namespace: Default namespace for searches
61+
use_max_completion_tokens: Use ``max_completion_tokens`` instead of
62+
``max_tokens`` in OpenAI API calls. Required for newer models
63+
(o1, o3, gpt-5-nano, etc.). Only applies when *llm* is not provided.
6064
6165
Example:
6266
# Use default prompts
@@ -85,7 +89,12 @@ def __init__(
8589
if llm is not None:
8690
self.llm = llm
8791
else:
88-
self.llm = OpenAILLM(api_key=llm_api_key, model=llm_model, prompts=prompts)
92+
self.llm = OpenAILLM(
93+
api_key=llm_api_key,
94+
model=llm_model,
95+
prompts=prompts,
96+
use_max_completion_tokens=use_max_completion_tokens,
97+
)
8998

9099
async def _execute_single_query(
91100
self,

uv.lock

Lines changed: 8 additions & 8 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)