src/together/resources/chat/completions.py (17 additions, 0 deletions)
@@ -41,6 +41,7 @@ def create(
         response_format: Dict[str, str | Dict[str, Any]] | None = None,
         tools: List[Dict[str, Any]] | None = None,
         tool_choice: str | Dict[str, str | Dict[str, str]] | None = None,
+        turboboost_tps: float | None = None,
         **kwargs: Any,
     ) -> ChatCompletionResponse | Iterator[ChatCompletionChunk]:
         """
@@ -103,6 +104,13 @@ def create(
                 via {"type": "function", "function": {"name": "my_function"}} forces the model to call that function.
                 Sets to `auto` if None.
                 Defaults to None.
+            turboboost_tps (float, optional): Controls the speed-quality tradeoff between the
+                draft model (faster, lower quality) and the target model (slower, higher quality).
+                Values range from 0.0 (regular speculative decoding with target-model quality) to
+                1.0 (draft-model speed with a 100% acceptance rate). Higher values increase speed
+                while potentially reducing quality.
+                Defaults to 0.0.
+
 
         Returns:
             ChatCompletionResponse | Iterator[ChatCompletionChunk]: Object containing the completions
@@ -135,6 +143,7 @@ def create(
             response_format=response_format,
             tools=tools,
             tool_choice=tool_choice,
+            turboboost_tps=turboboost_tps,
             **kwargs,
         ).model_dump(exclude_none=True)
 
@@ -183,6 +192,7 @@ async def create(
         response_format: Dict[str, Any] | None = None,
         tools: Dict[str, str | Dict[str, str | Dict[str, Any]]] | None = None,
         tool_choice: str | Dict[str, str | Dict[str, str]] | None = None,
+        turboboost_tps: float | None = None,
         **kwargs: Any,
     ) -> AsyncGenerator[ChatCompletionChunk, None] | ChatCompletionResponse:
         """
@@ -245,6 +255,12 @@ async def create(
                 via {"type": "function", "function": {"name": "my_function"}} forces the model to call that function.
                 Sets to `auto` if None.
                 Defaults to None.
+            turboboost_tps (float, optional): Controls the speed-quality tradeoff between the
+                draft model (faster, lower quality) and the target model (slower, higher quality).
+                Values range from 0.0 (regular speculative decoding with target-model quality) to
+                1.0 (draft-model speed with a 100% acceptance rate). Higher values increase speed
+                while potentially reducing quality.
+                Defaults to 0.0.
 
         Returns:
             AsyncGenerator[ChatCompletionChunk, None] | ChatCompletionResponse: Object containing the completions
@@ -277,6 +293,7 @@ async def create(
             response_format=response_format,
             tools=tools,
             tool_choice=tool_choice,
+            turboboost_tps=turboboost_tps,
             **kwargs,
         ).model_dump(exclude_none=True)
 
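The new argument threads straight through to the request payload (and is dropped by `model_dump(exclude_none=True)` when left as None), so it can be passed like any other sampling option. Below is a minimal sketch of a call through the SDK's `Together` client; the model name is illustrative, and `turboboost_tps=0.5` simply picks the midpoint of the documented 0.0-1.0 range.

```python
# Minimal sketch: passing turboboost_tps through the chat completions API.
# The model name is illustrative; a model with a configured draft model for
# speculative decoding is assumed.
from together import Together

client = Together()  # reads TOGETHER_API_KEY from the environment

response = client.chat.completions.create(
    model="meta-llama/Llama-3-70b-chat-hf",  # illustrative model name
    messages=[{"role": "user", "content": "Explain speculative decoding in one paragraph."}],
    turboboost_tps=0.5,  # midpoint: trade some target-model quality for draft-model speed
)
print(response.choices[0].message.content)
```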
src/together/resources/completions.py (16 additions, 0 deletions)
@@ -38,6 +38,7 @@ def create(
         echo: bool | None = None,
         n: int | None = None,
         safety_model: str | None = None,
+        turboboost_tps: float | None = None,
         **kwargs: Any,
     ) -> CompletionResponse | Iterator[CompletionChunk]:
         """
@@ -88,6 +89,12 @@ def create(
             safety_model (str, optional): A moderation model to validate tokens. Choice between available moderation
                 models found [here](https://docs.together.ai/docs/inference-models#moderation-models).
                 Defaults to None.
+            turboboost_tps (float, optional): Controls the speed-quality tradeoff between the
+                draft model (faster, lower quality) and the target model (slower, higher quality).
+                Values range from 0.0 (regular speculative decoding with target-model quality) to
+                1.0 (draft-model speed with a 100% acceptance rate). Higher values increase speed
+                while potentially reducing quality.
+                Defaults to 0.0.
 
         Returns:
             CompletionResponse | Iterator[CompletionChunk]: Object containing the completions
@@ -117,6 +124,7 @@ def create(
             echo=echo,
             n=n,
             safety_model=safety_model,
+            turboboost_tps=turboboost_tps,
             **kwargs,
         ).model_dump(exclude_none=True)
 
@@ -162,6 +170,7 @@ async def create(
         echo: bool | None = None,
         n: int | None = None,
         safety_model: str | None = None,
+        turboboost_tps: float | None = None,
         **kwargs: Any,
     ) -> AsyncGenerator[CompletionChunk, None] | CompletionResponse:
         """
@@ -212,6 +221,12 @@ async def create(
             safety_model (str, optional): A moderation model to validate tokens. Choice between available moderation
                 models found [here](https://docs.together.ai/docs/inference-models#moderation-models).
                 Defaults to None.
+            turboboost_tps (float, optional): Controls the speed-quality tradeoff between the
+                draft model (faster, lower quality) and the target model (slower, higher quality).
+                Values range from 0.0 (regular speculative decoding with target-model quality) to
+                1.0 (draft-model speed with a 100% acceptance rate). Higher values increase speed
+                while potentially reducing quality.
+                Defaults to 0.0.
 
         Returns:
             AsyncGenerator[CompletionChunk, None] | CompletionResponse: Object containing the completions
@@ -241,6 +256,7 @@ async def create(
             echo=echo,
             n=n,
             safety_model=safety_model,
+            turboboost_tps=turboboost_tps,
             **kwargs,
         ).model_dump(exclude_none=True)
 
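For the plain completions endpoint the parameter behaves the same way. Here is a sketch of the async path, assuming the SDK's `AsyncTogether` client; the model name and prompt are illustrative, and `turboboost_tps=1.0` shows the maximum-speed end of the range.

```python
# Minimal sketch: turboboost_tps on the async completions API. AsyncTogether
# is the SDK's async client; the model name and prompt are illustrative.
import asyncio

from together import AsyncTogether


async def main() -> None:
    client = AsyncTogether()  # reads TOGETHER_API_KEY from the environment
    response = await client.completions.create(
        model="meta-llama/Llama-3-70b-hf",  # illustrative model name
        prompt="Speculative decoding speeds up inference by",
        max_tokens=64,
        turboboost_tps=1.0,  # fastest setting: every draft token is accepted
    )
    print(response.choices[0].text)


asyncio.run(main())
```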