|
| 1 | +# --- |
| 2 | +# mypy: ignore-errors |
| 3 | +# --- |
| 4 | + |
| 5 | +# # Sticky routing for Modal HTTP Servers |
| 6 | + |
| 7 | +# This example demonstrates the usage and behavior of |
| 8 | +# the optional "sticky" routing behavior of |
| 9 | +# Modal HTTP Servers with a basic routing test. |
| 10 | + |
| 11 | +# For a gentler introduction to Modal HTTP Servers, |
| 12 | +# see [this example](https://modal.com/docs/examples/http_server). |
| 13 | +# For the use of Modal HTTP Servers for LLM inference, |
| 14 | +# see [this example](https://modal.com/docs/examples/sglang_low_latency). |
| 15 | + |
| 16 | +# In sticky routing, sequential requests from the same client |
| 17 | +# are sent to the same server replica. |
| 18 | +# Modal HTTP Servers offer sticky routing for fixed replica sets |
| 19 | +# using [rendezvous hashing](https://randorithms.com/2020/12/26/rendezvous-hashing.html), |
| 20 | +# ensuring that as your servers scale up and down, load stays balanced across replicas |
| 21 | +# and clients are typically routed to the same replica for repeated requests. |
| 22 | + |
| 23 | +# Note that requests are not _guaranteed_ to be routed to the same replica, |
| 24 | +# and so this form of sticky routing should not be relied on for logical correctness. |
| 25 | +# Instead, this sticky routing is intended to be used as a performance optimization, |
# as in KV caching for [Transformer LLM inference](https://modal.com/docs/examples/sglang_low_latency).
| 27 | + |
| 28 | +# ## Define the Modal HTTP Server |
| 29 | + |
| 30 | +# First, we import the libraries we'll use both locally, to run a routing test, |
| 31 | +# and remotely, to run our server. |
| 32 | + |
| 33 | +# We also define our Modal [App](https://modal.com/docs/guide/apps) |
| 34 | +# and the Modal [Image](https://modal.com/docs/guide/images) |
| 35 | +# that provides the dependencies of our server code. |
| 36 | + |
| 37 | +import asyncio |
| 38 | +import time |
| 39 | +from dataclasses import dataclass |
| 40 | +from typing import Any |
| 41 | + |
| 42 | +import aiohttp |
| 43 | +import modal |
| 44 | +import modal.experimental |
| 45 | +from rich.console import Console |
| 46 | + |
# The Modal App that groups this example's resources.
app = modal.App("example-http-server-sticky")

# Server image: slim Debian plus FastAPI; the [standard] extra supplies the
# uvicorn server that Server.start runs below.
image = modal.Image.debian_slim().uv_pip_install("fastapi[standard]==0.115.4")
| 50 | + |
| 51 | +# Now we can define our HTTP Server. |
| 52 | +# We set the minimum number of containers (replicas) |
| 53 | +# to be greater than one so that there are multiple |
| 54 | +# replicas available for routing during our test. |
| 55 | + |
| 56 | +# Additionally, we set the regions into which we |
| 57 | +# want to deploy the proxies that communicate between |
| 58 | +# our clients and the server. |
| 59 | + |
| 60 | +# We also use the [`modal.concurrent` decorator](https://modal.com/docs/guide/concurrent-inputs) |
| 61 | +# to allow each HTTP Server replica to handle more than one input. |
| 62 | + |
# Modal HTTP Servers are structured as Modal [Classes](https://modal.com/docs/guide/lifecycle-functions)
| 64 | +# that start a process or thread that listens on the provided `port` in a `modal.enter`-decorated method. |
| 65 | +# Here, we spin up a simple FastAPI server that returns the |
| 66 | +# [identity of the replica within Modal](https://modal.com/docs/guide/environment_variables) |
| 67 | +# and run it with `uvicorn`. |
| 68 | + |
PORT = 8000  # port the in-container server listens on; must match http_server(port=...)
CONTAINERS = 2  # minimum replica count; >1 so the routing test has multiple targets
PROXY_REGIONS = ["us-west"]  # regions for the client-facing proxies
| 72 | + |
| 73 | + |
| 74 | +@app.cls(image=image, min_containers=CONTAINERS) |
| 75 | +@modal.experimental.http_server(port=PORT, proxy_regions=PROXY_REGIONS) |
| 76 | +@modal.concurrent(target_inputs=100) |
| 77 | +class Server: |
| 78 | + @modal.enter() |
| 79 | + def start(self): |
| 80 | + import os |
| 81 | + import threading |
| 82 | + |
| 83 | + import uvicorn |
| 84 | + from fastapi import FastAPI |
| 85 | + |
| 86 | + container_id = os.environ["MODAL_TASK_ID"] |
| 87 | + fastapi_app = FastAPI(title=container_id) |
| 88 | + |
| 89 | + @fastapi_app.post("/") |
| 90 | + async def whoami(): |
| 91 | + return {"CONTAINER_ID": container_id} |
| 92 | + |
| 93 | + self.thread = threading.Thread( |
| 94 | + target=uvicorn.run, |
| 95 | + kwargs={"app": fastapi_app, "host": "0.0.0.0", "port": PORT}, |
| 96 | + daemon=True, |
| 97 | + ) |
| 98 | + self.thread.start() |
| 99 | + |
| 100 | + |
| 101 | +# ## Test the routing behavior of the Modal HTTP Server |
| 102 | + |
| 103 | +# Now we define our routing test, which will run locally |
| 104 | +# and interact with our Modal HTTP Server by sending requests. |
| 105 | + |
| 106 | +# It spins up some `n`umber of `client` tasks and repeatedly sends requests from each for some number of `seconds`. |
| 107 | +# The clients can be configured to use `sticky` routing or not (`--no-sticky`). |
| 108 | + |
| 109 | +# The test uses the `CONTAINER_ID`s returned by the HTTP Server |
| 110 | +# to track whether clients' requests are serviced by the same or different replicas. |
| 111 | +# It fails if the clients were configured to be sticky and any client |
| 112 | +# observes a different `CONTAINER_ID` on different requests. |
| 113 | +# So long as the set of containers does not change, |
| 114 | +# due to, for instance, replica failure or pre-emption, |
| 115 | +# this test should pass. |
| 116 | + |
| 117 | + |
@app.local_entrypoint()
async def test(n_clients: int = 10, sticky: bool = True, seconds: float = 5.0):
    """Run `n_clients` concurrent clients against the server for `seconds`
    seconds and fail if sticky routing was requested but violated."""
    # Block until at least one replica answers with a non-503 status.
    url = (await Server._experimental_get_flash_urls.aio())[0]
    async with aiohttp.ClientSession() as sess:
        await wait_available(sess, url)

    # Generous heuristic pause so all replicas can come up; remove this
    # sleep and raise CONTAINERS to observe session routing changes
    # while the pool autoscales.
    await asyncio.sleep(5 + ((CONTAINERS - 10) // 2))

    # Execute the routing test and reduce the per-client results.
    results = await run_clients(url, n_clients, seconds, sticky)
    stats = aggregate_results(results)

    await asyncio.sleep(1)  # let server-side logs flush before printing
    print_summary(url, sticky, n_clients, seconds, stats)

    if sticky and stats["multi"]:
        raise AssertionError("Sticky routing violated for some clients")
| 141 | + |
| 142 | + |
| 143 | +# Because it is a Modal `local_entrypoint`, |
| 144 | +# this Python function automatically gets a CLI: |
| 145 | + |
| 146 | +# ```bash |
| 147 | +# modal run http_server_sticky.py --help |
| 148 | +# ``` |
| 149 | + |
| 150 | +# You can run the test with: |
| 151 | + |
| 152 | +# ```bash |
| 153 | +# modal run http_server_sticky.py |
| 154 | +# ``` |
| 155 | + |
| 156 | +# ## Write the client for the Modal HTTP Server |
| 157 | + |
| 158 | +# The code in this section implements some Modal HTTP Server-specific client logic. |
| 159 | + |
| 160 | +# First, clients of Modal HTTP Servers need to handle |
| 161 | +# [503 Service Unavailable](https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Status/503) |
| 162 | +# error response status codes, which are returned whenever there are no live replicas. |
| 163 | + |
| 164 | +# In our case, we use them as a signal that at least one replica |
| 165 | +# is ready and so we can proceed with the test. |
| 166 | + |
| 167 | + |
async def wait_available(sess: aiohttp.ClientSession, url: str) -> None:
    """Poll `url` until the server responds with something other than 503.

    Modal HTTP Servers return 503 Service Unavailable while no replica is
    live, so any other status means at least one replica is ready.

    Args:
        sess: the client session to issue readiness probes with.
        url: the HTTP Server URL to probe.
    """
    while True:
        async with sess.post(url, json={}) as resp:
            if resp.status != 503:
                return
        # Back off briefly between probes: the original tight loop issued
        # back-to-back requests with no pause, hammering the proxy while
        # replicas were still spinning up.
        await asyncio.sleep(0.25)
| 173 | + |
| 174 | + |
| 175 | +# The full client logic appears in the function below. |
| 176 | +# Notably, it includes the header `Modal-Session-Id` |
| 177 | +# if clients are configured for sticky routing. |
| 178 | +# Here, we choose a simple small integer `client_id`. |
| 179 | + |
| 180 | +# The client collects information about which `CONTAINER_ID`s |
| 181 | +# it receives from the server and returns those in the form of |
| 182 | +# a simple `dataclass`. |
| 183 | + |
| 184 | + |
@dataclass
class ClientResult:
    """Per-client tally from one run of the routing test."""

    client_id: int  # small-integer identity; sent as Modal-Session-Id when sticky
    containers_seen: set[str]  # distinct CONTAINER_IDs observed in 200 responses
    requests_ok: int  # count of 200 responses
    requests_err: int  # count of non-200 responses and timeouts
| 191 | + |
| 192 | + |
async def client(
    url: str, client_id: int, seconds: float, sticky: bool
) -> ClientResult:
    """Send requests to `url` for `seconds` seconds, recording which replicas answer.

    When `sticky` is set, the `Modal-Session-Id` header carries the client's id
    so the proxy applies sticky (rendezvous-hash) routing for this client.

    Args:
        url: the HTTP Server URL to send POST requests to.
        client_id: small integer identifying this client; used as the session id.
        seconds: wall-clock duration to keep sending requests.
        sticky: whether to request sticky routing via the session header.

    Returns:
        A ClientResult with the containers seen and ok/error request counts.
    """
    headers = {"Modal-Session-Id": str(client_id)} if sticky else {}
    end = time.monotonic() + seconds

    seen: set[str] = set()
    n_ok: int = 0
    n_err: int = 0

    async with aiohttp.ClientSession(headers=headers) as sess:
        while time.monotonic() < end:
            try:
                async with sess.post(
                    url, json={}, timeout=aiohttp.ClientTimeout(total=5)
                ) as resp:
                    if resp.status == 200:
                        data = await resp.json()
                        seen.add(data["CONTAINER_ID"])
                        n_ok += 1
                    else:
                        n_err += 1
            # Count timeouts AND transient connection failures (e.g. a replica
            # being replaced mid-test) as errors. The original caught only
            # asyncio.TimeoutError, so a single aiohttp.ClientError would
            # propagate and abort the whole asyncio.gather in run_clients.
            except (asyncio.TimeoutError, aiohttp.ClientError):
                n_err += 1

    return ClientResult(client_id, seen, n_ok, n_err)
| 219 | + |
| 220 | + |
| 221 | +# ## Addenda |
| 222 | + |
| 223 | +# The remainder of this code is required for this example to run |
| 224 | +# but is not necessary for Modal HTTP Servers or their clients in general. |
| 225 | +# For instance, it defines the logic for concurrency and result aggregation/display |
| 226 | +# for this particular routing test. |
| 227 | + |
| 228 | + |
async def run_clients(
    url: str, n_clients: int, seconds: float, sticky: bool
) -> list[ClientResult]:
    """Launch `n_clients` concurrent client tasks and collect their results."""
    coros = (client(url, cid, seconds, sticky) for cid in range(n_clients))
    gathered = await asyncio.gather(*coros)
    return list(gathered)
| 234 | + |
| 235 | + |
def aggregate_results(results: list[ClientResult]) -> dict[str, Any]:
    """Reduce per-client results into the summary dict consumed by print_summary.

    Returns a dict with total ok/error counts, a `multi` map of clients that
    saw more than one container, and the full per-client container sets.
    """
    stats: dict[str, Any] = {
        "total_ok": 0,
        "total_err": 0,
        "multi": {},
        "per_client": [],
    }
    # Single pass over the results instead of one traversal per statistic.
    for r in results:
        stats["total_ok"] += r.requests_ok
        stats["total_err"] += r.requests_err
        stats["per_client"].append((r.client_id, r.containers_seen))
        if len(r.containers_seen) > 1:
            stats["multi"][r.client_id] = r.containers_seen
    return stats
| 251 | + |
| 252 | + |
def print_summary(
    url: str,
    sticky: bool,
    n_clients: int,
    seconds: float,
    stats: dict[str, Any],
    console: Console | None = None,
) -> None:
    """Render the aggregated routing-test results to the terminal via rich."""
    out = console if console is not None else Console()
    out.print()
    out.print(
        f"[bold]url=[/]{url} [bold]sticky=[/]{sticky} "
        f"[bold]clients=[/]{n_clients} [bold]duration_s=[/]{seconds}"
    )
    out.print(
        f"[green]total_ok={stats['total_ok']}[/] "
        f"[red]total_err={stats['total_err']}[/]"
    )

    # One line per client listing every container it was routed to.
    for cid, containers in stats["per_client"]:
        out.print(f"  client={cid} containers={list(containers)}")
    out.print(
        f"Clients with multiple containers: "
        f"[yellow]{len(stats['multi'])}/{n_clients}[/]"
    )
0 commit comments