@@ -853,8 +853,19 @@ type RealtimeSessionCreateRequestParam struct {
853853 ToolChoice RealtimeToolChoiceConfigUnionParam `json:"tool_choice,omitzero"`
854854 // Tools available to the model.
855855 Tools RealtimeToolsConfigParam `json:"tools,omitzero"`
856- // Controls how the realtime conversation is truncated prior to model inference.
857- // The default is `auto`.
856+ // When the number of tokens in a conversation exceeds the model's input token
857+ // limit, the conversation will be truncated, meaning messages (starting from the
858+ // oldest) will not be included in the model's context. A 32k context model with
859+ // 4,096 max output tokens can only include 28,224 tokens in the context before
860+ // truncation occurs. Clients can configure truncation behavior to truncate with a
861+ // lower max token limit, which is an effective way to control token usage and
862+ // cost. Truncation will reduce the number of cached tokens on the next turn
863+ // (busting the cache), since messages are dropped from the beginning of the
864+ // context. However, clients can also configure truncation to retain messages up to
865+ // a fraction of the maximum context size, which will reduce the need for future
866+ // truncations and thus improve the cache rate. Truncation can be disabled
867+ // entirely, which means the server will never truncate but would instead return an
868+ // error if the conversation exceeds the model's input token limit.
858869 Truncation RealtimeTruncationUnionParam `json:"truncation,omitzero"`
859870 // The type of session to create. Always `realtime` for the Realtime API.
860871 //
@@ -1697,10 +1708,13 @@ type RealtimeTruncationUnion struct {
16971708 RetentionRatio float64 `json:"retention_ratio"`
16981709 // This field is from variant [RealtimeTruncationRetentionRatio].
16991710 Type constant.RetentionRatio `json:"type"`
1700- JSON struct {
1711+ // This field is from variant [RealtimeTruncationRetentionRatio].
1712+ TokenLimits RealtimeTruncationRetentionRatioTokenLimits `json:"token_limits"`
1713+ JSON struct {
17011714 OfRealtimeTruncationStrategy respjson.Field
17021715 RetentionRatio respjson.Field
17031716 Type respjson.Field
1717+ TokenLimits respjson.Field
17041718 raw string
17051719 } `json:"-"`
17061720}
@@ -1778,15 +1792,21 @@ func (u *RealtimeTruncationUnionParam) asAny() any {
17781792// input token limit. This allows you to amortize truncations across multiple
17791793// turns, which can help improve cached token usage.
17801794type RealtimeTruncationRetentionRatio struct {
1781- // Fraction of post-instruction conversation tokens to retain (0.0 - 1.0) when the
1782- // conversation exceeds the input token limit.
1795+ // Fraction of post-instruction conversation tokens to retain (`0.0` - `1.0`) when
1796+ // the conversation exceeds the input token limit. Setting this to `0.8` means that
1797+ // messages will be dropped until 80% of the maximum allowed tokens are used. This
1798+ // helps reduce the frequency of truncations and improve cache rates.
17831799 RetentionRatio float64 `json:"retention_ratio,required"`
17841800 // Use retention ratio truncation.
17851801 Type constant.RetentionRatio `json:"type,required"`
1802+ // Optional custom token limits for this truncation strategy. If not provided, the
1803+ // model's default token limits will be used.
1804+ TokenLimits RealtimeTruncationRetentionRatioTokenLimits `json:"token_limits"`
17861805 // JSON contains metadata for fields, check presence with [respjson.Field.Valid].
17871806 JSON struct {
17881807 RetentionRatio respjson.Field
17891808 Type respjson.Field
1809+ TokenLimits respjson.Field
17901810 ExtraFields map [string ]respjson.Field
17911811 raw string
17921812 } `json:"-"`
@@ -1808,15 +1828,43 @@ func (r RealtimeTruncationRetentionRatio) ToParam() RealtimeTruncationRetentionR
18081828 return param.Override [RealtimeTruncationRetentionRatioParam ](json .RawMessage (r .RawJSON ()))
18091829}
18101830
1831+ // Optional custom token limits for this truncation strategy. If not provided, the
1832+ // model's default token limits will be used.
1833+ type RealtimeTruncationRetentionRatioTokenLimits struct {
1834+ // Maximum tokens allowed in the conversation after instructions (which includes
1835+ // tool definitions). For example, setting this to 5,000 would mean that truncation
1836+ // would occur when the conversation exceeds 5,000 tokens after instructions. This
1837+ // cannot be higher than the model's context window size minus the maximum output
1838+ // tokens.
1839+ PostInstructions int64 `json:"post_instructions"`
1840+ // JSON contains metadata for fields, check presence with [respjson.Field.Valid].
1841+ JSON struct {
1842+ PostInstructions respjson.Field
1843+ ExtraFields map [string ]respjson.Field
1844+ raw string
1845+ } `json:"-"`
1846+ }
1847+
1848+ // Returns the unmodified JSON received from the API
1849+ func (r RealtimeTruncationRetentionRatioTokenLimits ) RawJSON () string { return r .JSON .raw }
1850+ func (r * RealtimeTruncationRetentionRatioTokenLimits ) UnmarshalJSON (data []byte ) error {
1851+ return apijson .UnmarshalRoot (data , r )
1852+ }
1853+
18111854// Retain a fraction of the conversation tokens when the conversation exceeds the
18121855// input token limit. This allows you to amortize truncations across multiple
18131856// turns, which can help improve cached token usage.
18141857//
18151858// The properties RetentionRatio, Type are required.
18161859type RealtimeTruncationRetentionRatioParam struct {
1817- // Fraction of post-instruction conversation tokens to retain (0.0 - 1.0) when the
1818- // conversation exceeds the input token limit.
1860+ // Fraction of post-instruction conversation tokens to retain (`0.0` - `1.0`) when
1861+ // the conversation exceeds the input token limit. Setting this to `0.8` means that
1862+ // messages will be dropped until 80% of the maximum allowed tokens are used. This
1863+ // helps reduce the frequency of truncations and improve cache rates.
18191864 RetentionRatio float64 `json:"retention_ratio,required"`
1865+ // Optional custom token limits for this truncation strategy. If not provided, the
1866+ // model's default token limits will be used.
1867+ TokenLimits RealtimeTruncationRetentionRatioTokenLimitsParam `json:"token_limits,omitzero"`
18201868 // Use retention ratio truncation.
18211869 //
18221870 // This field can be elided, and will marshal its zero value as "retention_ratio".
@@ -1831,3 +1879,23 @@ func (r RealtimeTruncationRetentionRatioParam) MarshalJSON() (data []byte, err e
18311879func (r * RealtimeTruncationRetentionRatioParam ) UnmarshalJSON (data []byte ) error {
18321880 return apijson .UnmarshalRoot (data , r )
18331881}
1882+
1883+ // Optional custom token limits for this truncation strategy. If not provided, the
1884+ // model's default token limits will be used.
1885+ type RealtimeTruncationRetentionRatioTokenLimitsParam struct {
1886+ // Maximum tokens allowed in the conversation after instructions (which includes
1887+ // tool definitions). For example, setting this to 5,000 would mean that truncation
1888+ // would occur when the conversation exceeds 5,000 tokens after instructions. This
1889+ // cannot be higher than the model's context window size minus the maximum output
1890+ // tokens.
1891+ PostInstructions param.Opt [int64 ] `json:"post_instructions,omitzero"`
1892+ paramObj
1893+ }
1894+
1895+ func (r RealtimeTruncationRetentionRatioTokenLimitsParam ) MarshalJSON () (data []byte , err error ) {
1896+ type shadow RealtimeTruncationRetentionRatioTokenLimitsParam
1897+ return param .MarshalObject (r , (* shadow )(& r ))
1898+ }
1899+ func (r * RealtimeTruncationRetentionRatioTokenLimitsParam ) UnmarshalJSON (data []byte ) error {
1900+ return apijson .UnmarshalRoot (data , r )
1901+ }
0 commit comments