Skip to content

Commit d08442b

Browse files
feat(arkruntime): support context rolling tokens
1 parent 8ada35f commit d08442b

File tree

5 files changed

+22
-37
lines changed

5 files changed

+22
-37
lines changed

volcenginesdkarkruntime/resources/context/context.py

Lines changed: 5 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from __future__ import annotations
44
import httpx
55

6-
from typing import Iterable, Optional
6+
from typing import Iterable, Optional, Literal
77

88
from ..._types import Body, Query, Headers
99
from .completions import Completions, AsyncCompletions
@@ -13,7 +13,7 @@
1313
from ..._base_client import (
1414
make_request_options,
1515
)
16-
from ...types.context import CreateContextResponse, CloneContextResponse
16+
from ...types.context import CreateContextResponse
1717
from ...types.context.context_create_params import TTLTypes, TruncationStrategy, to_optional_ttl
1818
from ...types.chat import ChatCompletionMessageParam
1919

@@ -32,6 +32,7 @@ def create(
3232
model: str,
3333
messages: Iterable[ChatCompletionMessageParam],
3434
ttl: Optional[TTLTypes] | None = None,
35+
mode: Literal["session"] = "session",
3536
truncation_strategy: Optional[TruncationStrategy] | None = None,
3637
extra_headers: Headers | None = None,
3738
extra_query: Query | None = None,
@@ -43,6 +44,7 @@ def create(
4344
"/context/create",
4445
body={
4546
"model": model,
47+
"mode": mode,
4648
"messages": messages,
4749
"ttl": ttl,
4850
"truncation_strategy": truncation_strategy,
@@ -56,30 +58,6 @@ def create(
5658
cast_to=CreateContextResponse,
5759
)
5860

59-
def clone(
60-
self,
61-
*,
62-
context_id: str,
63-
extra_headers: Headers | None = None,
64-
extra_query: Query | None = None,
65-
extra_body: Body | None = None,
66-
timeout: float | httpx.Timeout | None = None,
67-
) -> CloneContextResponse:
68-
return self._post(
69-
"/context/clone",
70-
body={
71-
"context_id": context_id,
72-
},
73-
options=make_request_options(
74-
extra_headers=extra_headers,
75-
extra_query=extra_query,
76-
extra_body=extra_body,
77-
timeout=timeout,
78-
),
79-
cast_to=CloneContextResponse,
80-
)
81-
82-
8361
class AsyncContext(AsyncAPIResource):
8462
@cached_property
8563
def completions(self) -> AsyncCompletions:
@@ -90,6 +68,7 @@ async def create(
9068
self,
9169
*,
9270
model: str,
71+
mode: Literal["session"] = "session",
9372
messages: Iterable[ChatCompletionMessageParam],
9473
ttl: Optional[TTLTypes] | None = None,
9574
truncation_strategy: Optional[TruncationStrategy] | None = None,

volcenginesdkarkruntime/types/context/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,5 @@
44

55
from .context_chat_completion_chunk import ContextChatCompletionChunk
66
from .context_chat_completion import ContextChatCompletion
7-
from .create_context_response import CreateContextResponse, CloneContextResponse
7+
from .create_context_response import CreateContextResponse
88
from .context_create_params import TruncationStrategy, TTLTypes

volcenginesdkarkruntime/types/context/context_create_params.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,12 @@
1212

1313

1414
class TruncationStrategy(TypedDict, total=False):
15-
type: Required[Literal["last_history_tokens"]]
15+
type: Required[Literal["last_history_tokens", "rolling_tokens"]]
1616
"""The truncation strategy to use for the context. The default is last_history_tokens."""
1717
last_history_tokens: Optional[int]
1818
"""The number of most recent tokens from the context when constructing the chat completion."""
19+
rolling_tokens: Optional[bool]
20+
"""If true, the context will not rolling when reach the max tokens limit."""
1921

2022

2123
TTLTypes = Union[int, datetime.timedelta]
Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,25 @@
11
from ..._models import BaseModel
2+
from ..completion_usage import CompletionUsage
23
from .truncation_strategy import TruncationStrategy
34

4-
__all__ = ["CreateContextResponse", "CloneContextResponse"]
5+
__all__ = ["CreateContextResponse"]
56

67

78
class CreateContextResponse(BaseModel):
89
id: str
910
"""A unique identifier for the context."""
1011
model: str
1112
"""The endpoint used for the context."""
13+
mode: str
14+
"""The mode used for the context."""
1215
ttl: int
1316
"""The time to live (TTL) for the context in seconds."""
1417
truncation_strategy: TruncationStrategy
1518
"""
1619
Controls for how a context will be truncated prior to the run.
1720
Use this to control the context window for the chat completion.
1821
"""
19-
20-
21-
class CloneContextResponse(CreateContextResponse):
22-
pass
22+
usage: CompletionUsage
23+
"""
24+
Usage statistics for the context create request.
25+
"""

volcenginesdkexamples/volcenginesdkarkruntime/context.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,11 @@
1919
print("----- create context -----")
2020
response = client.context.create(
2121
model="${YOUR_ENDPOINT_ID}",
22+
mode="session",
2223
messages=[
2324
{"role": "system", "content": "你是豆包,是由字节跳动开发的 AI 人工智能助手"},
2425
],
2526
ttl=datetime.timedelta(minutes=60),
26-
truncation_strategy={
27-
'type': 'last_history_tokens',
28-
'last_history_tokens': 4096
29-
}
3027
)
3128
print(response)
3229

@@ -40,6 +37,7 @@
4037
stream=False
4138
)
4239
print(chat_response.choices[0].message.content)
40+
print(chat_response.usage)
4341

4442
print("----- chat round 2 (streaming) -----")
4543
stream = client.context.completions.create(
@@ -48,6 +46,9 @@
4846
messages=[
4947
{"role": "user", "content": "我是谁?"},
5048
],
49+
stream_options={
50+
'include_usage': True,
51+
},
5152
stream=True
5253
)
5354
for chunk in stream:

0 commit comments

Comments
 (0)