
Commit 2d7b835

release: 2.0.0-alpha.17 (#217)
* feat: Improve usage of models list cli command (#216)
* feat: Improve usage of models list cli command
* bring back context length and improve pricing field
* cleanup
* feat(cli): add b200 and h200 GPU options for endpoint creation (#218)
* chore: Deprecate CLI usage for endpoints create flag --no-promopt-cache (#219)
* chore: Mark disable_prompt_cache as deprecated for endpoint creation
* release: 2.0.0-alpha.17

---------

Co-authored-by: Blaine Kasten <blainekasten@gmail.com>
Co-authored-by: stainless-app[bot] <142633134+stainless-app[bot]@users.noreply.github.com>
1 parent 345b121 commit 2d7b835

File tree

9 files changed: 48 additions, 27 deletions


.release-please-manifest.json

Lines changed: 1 addition & 1 deletion
@@ -1,3 +1,3 @@
 {
-  ".": "2.0.0-alpha.16"
+  ".": "2.0.0-alpha.17"
 }

.stats.yml

Lines changed: 2 additions & 2 deletions
@@ -1,4 +1,4 @@
 configured_endpoints: 55
-openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/togetherai%2Ftogetherai-585412004e70865cc6e32fdda4177eabcffd0f165e485da320bad9514960ebe3.yml
-openapi_spec_hash: 70b0de2b3a0eaa3dca00722ba76d5e54
+openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/togetherai%2Ftogetherai-cea384db9edf6057ebc5c198a155955b97771430e7afe3be910842c734bb9812.yml
+openapi_spec_hash: 9c2c575baec9b59add4b2e91c14089ad
 config_hash: a955366d5f659d70d9e6b26116e119bf

CHANGELOG.md

Lines changed: 15 additions & 0 deletions
@@ -1,5 +1,20 @@
 # Changelog
 
+## 2.0.0-alpha.17 (2026-01-21)
+
+Full Changelog: [v2.0.0-alpha.16...v2.0.0-alpha.17](https://github.com/togethercomputer/together-py/compare/v2.0.0-alpha.16...v2.0.0-alpha.17)
+
+### Features
+
+* **cli:** add b200 and h200 GPU options for endpoint creation ([#218](https://github.com/togethercomputer/together-py/issues/218)) ([b514912](https://github.com/togethercomputer/together-py/commit/b514912a281922fefbf8a9f62b936ed1de243718))
+* Improve usage of models list cli command ([#216](https://github.com/togethercomputer/together-py/issues/216)) ([430e6c1](https://github.com/togethercomputer/together-py/commit/430e6c1e030749be474f020b677d91014ba4482c))
+
+
+### Chores
+
+* Deprecate CLI usage for endpoints create flag --no-promopt-cache ([#219](https://github.com/togethercomputer/together-py/issues/219)) ([55e9700](https://github.com/togethercomputer/together-py/commit/55e9700187b42f8baff6f567a3a657b46577ed88))
+* Mark disable_prompt_cache as deprecated for endpoint creation ([6a629b2](https://github.com/togethercomputer/together-py/commit/6a629b29e53b4374503d30ca75456184ef313b67))
+
 ## 2.0.0-alpha.16 (2026-01-18)
 
 Full Changelog: [v2.0.0-alpha.15...v2.0.0-alpha.16](https://github.com/togethercomputer/together-py/compare/v2.0.0-alpha.15...v2.0.0-alpha.16)

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [project]
 name = "together"
-version = "2.0.0-alpha.16"
+version = "2.0.0-alpha.17"
 description = "The official Python library for the together API"
 dynamic = ["readme"]
 license = "Apache-2.0"

src/together/_version.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
 
 __title__ = "together"
-__version__ = "2.0.0-alpha.16"  # x-release-please-version
+__version__ = "2.0.0-alpha.17"  # x-release-please-version

src/together/lib/cli/api/endpoints/create.py

Lines changed: 7 additions & 5 deletions
@@ -31,7 +31,7 @@
 )
 @click.option(
     "--gpu",
-    type=click.Choice(["h100", "a100", "l40", "l40s", "rtx-6000"]),
+    type=click.Choice(["b200", "h200", "h100", "a100", "l40", "l40s", "rtx-6000"]),
     required=True,
     help="GPU type to use for inference",
 )
@@ -48,7 +48,7 @@
 @click.option(
     "--no-prompt-cache",
     is_flag=True,
-    help="Disable the prompt cache for this endpoint",
+    help="Deprecated and no longer has any effect.",
 )
 @click.option(
     "--no-speculative-decoding",
@@ -95,13 +95,18 @@ def create(
     client: Together = ctx.obj
     # Map GPU types to their full hardware ID names
     gpu_map = {
+        "b200": "nvidia_b200_180gb_sxm",
+        "h200": "nvidia_h200_140gb_sxm",
         "h100": "nvidia_h100_80gb_sxm",
         "a100": "nvidia_a100_80gb_pcie" if gpu_count == 1 else "nvidia_a100_80gb_sxm",
         "l40": "nvidia_l40",
         "l40s": "nvidia_l40s",
         "rtx-6000": "nvidia_rtx_6000_ada",
     }
 
+    if no_prompt_cache is not None:
+        click.echo("Warning: --no-prompt-cache is deprecated and no longer has any effect.", err=True)
+
     hardware_id = f"{gpu_count}x_{gpu_map[gpu]}"
 
     try:
@@ -113,7 +118,6 @@ def create(
                 "max_replicas": max_replicas,
             },
             display_name=display_name or omit,
-            disable_prompt_cache=no_prompt_cache or omit,
             disable_speculative_decoding=no_speculative_decoding or omit,
             state="STOPPED" if no_auto_start else "STARTED",
             inactive_timeout=inactive_timeout,
@@ -134,8 +138,6 @@ def create(
         click.echo(f"  Hardware: {hardware_id}", err=True)
     if display_name:
         click.echo(f"  Display name: {display_name}", err=True)
-    if no_prompt_cache:
-        click.echo("  Prompt cache: disabled", err=True)
     if no_speculative_decoding:
         click.echo("  Speculative decoding: disabled", err=True)
     if no_auto_start:
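For reference, a minimal sketch of the hardware-ID resolution the updated command performs, using the GPU-to-hardware strings visible in this diff. The standalone resolve_hardware_id helper below is illustrative only and is not part of the SDK; only the mapping values and the "{count}x_{name}" format come from the code above.

# Illustrative sketch: mirrors the gpu_map / hardware_id logic in create.py above.
# resolve_hardware_id is not an SDK function; the mapping strings match the diff.
GPU_MAP = {
    "b200": "nvidia_b200_180gb_sxm",
    "h200": "nvidia_h200_140gb_sxm",
    "h100": "nvidia_h100_80gb_sxm",
    "l40": "nvidia_l40",
    "l40s": "nvidia_l40s",
    "rtx-6000": "nvidia_rtx_6000_ada",
}

def resolve_hardware_id(gpu: str, gpu_count: int) -> str:
    """Build the hardware ID string the CLI passes to the endpoints API."""
    if gpu == "a100":
        # a100 uses PCIe for a single GPU and SXM for multi-GPU endpoints.
        name = "nvidia_a100_80gb_pcie" if gpu_count == 1 else "nvidia_a100_80gb_sxm"
    else:
        name = GPU_MAP[gpu]
    return f"{gpu_count}x_{name}"

print(resolve_hardware_id("h200", 2))  # 2x_nvidia_h200_140gb_sxm
print(resolve_hardware_id("b200", 1))  # 1x_nvidia_b200_180gb_sxm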

src/together/lib/cli/api/models/list.py

Lines changed: 18 additions & 14 deletions
@@ -5,9 +5,9 @@
 from tabulate import tabulate
 
 from together import Together, omit
-from together._models import BaseModel
 from together._response import APIResponse as APIResponse
 from together.lib.cli.api._utils import handle_api_errors
+from together.lib.utils.serializer import datetime_serializer
 
 
 @click.command()
@@ -29,23 +29,27 @@ def list(ctx: click.Context, type: Optional[str], json: bool) -> None:
 
     models_list = client.models.list(dedicated=type == "dedicated" if type else omit)
 
+    if json:
+        items = [model.model_dump() for model in models_list]
+        click.echo(json_lib.dumps(items, indent=2, default=datetime_serializer))
+        return
+
     display_list: List[Dict[str, Any]] = []
-    model: BaseModel
-    for model in models_list:
+    for model in sorted(models_list, key=lambda x: x.type):
+        price_parts: List[str] = []
+
+        # Only show pricing if a value actually exists
+        if model.pricing and model.pricing.input > 0 and model.pricing.output > 0:
+            price_parts.append(f"${model.pricing.input:.2f}")
+            price_parts.append(f"${model.pricing.output:.2f}")
+
         display_list.append(
             {
-                "ID": model.id,
-                "Name": model.display_name,
-                "Organization": model.organization,
+                "Model": model.id,
                 "Type": model.type,
-                "Context Length": model.context_length,
-                "License": model.license,
-                "Input per 1M token": model.pricing.input if model.pricing else None,
-                "Output per 1M token": model.pricing.output if model.pricing else None,
+                "Context length": model.context_length if model.context_length else None,
+                "Price per 1M Tokens (input/output)": "/".join(price_parts),
            }
         )
 
-    if json:
-        click.echo(json_lib.dumps(display_list, indent=2))
-    else:
-        click.echo(tabulate(display_list, headers="keys", tablefmt="plain"))
+    click.echo(tabulate(display_list, headers="keys"))
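For reference, a minimal standalone sketch of the row and pricing formatting the table path now uses. The format_row helper and the sample model data are illustrative assumptions; only the column names and the rule "show a price string only when both input and output prices are non-zero" come from the diff above.

from typing import Any, Dict, List, Optional

from tabulate import tabulate


def format_row(model_id: str, model_type: str, context_length: Optional[int],
               price_input: Optional[float], price_output: Optional[float]) -> Dict[str, Any]:
    """Illustrative helper mirroring the row-building logic in the diff above."""
    price_parts: List[str] = []
    # Only show pricing when both values exist and are greater than zero.
    if price_input and price_output and price_input > 0 and price_output > 0:
        price_parts.append(f"${price_input:.2f}")
        price_parts.append(f"${price_output:.2f}")
    return {
        "Model": model_id,
        "Type": model_type,
        "Context length": context_length if context_length else None,
        "Price per 1M Tokens (input/output)": "/".join(price_parts),
    }


# Sample data for illustration only; real rows come from client.models.list().
rows = [
    format_row("meta-llama/Llama-3-8b-chat-hf", "chat", 8192, 0.20, 0.20),
    format_row("example-org/free-model", "chat", 4096, 0.0, 0.0),
]
print(tabulate(rows, headers="keys"))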

src/together/resources/endpoints.py

Lines changed: 2 additions & 2 deletions
@@ -81,7 +81,7 @@ def create(
 
           availability_zone: Create the endpoint in a specified availability zone (e.g., us-central-4b)
 
-          disable_prompt_cache: Whether to disable the prompt cache for this endpoint
+          disable_prompt_cache: This parameter is deprecated and no longer has any effect.
 
           disable_speculative_decoding: Whether to disable speculative decoding for this endpoint
 
@@ -375,7 +375,7 @@ async def create(
 
           availability_zone: Create the endpoint in a specified availability zone (e.g., us-central-4b)
 
-          disable_prompt_cache: Whether to disable the prompt cache for this endpoint
+          disable_prompt_cache: This parameter is deprecated and no longer has any effect.
 
           disable_speculative_decoding: Whether to disable speculative decoding for this endpoint

src/together/types/endpoint_create_params.py

Lines changed: 1 addition & 1 deletion
@@ -24,7 +24,7 @@ class EndpointCreateParams(TypedDict, total=False):
     """Create the endpoint in a specified availability zone (e.g., us-central-4b)"""
 
     disable_prompt_cache: bool
-    """Whether to disable the prompt cache for this endpoint"""
+    """This parameter is deprecated and no longer has any effect."""
 
     disable_speculative_decoding: bool
     """Whether to disable speculative decoding for this endpoint"""
