Skip to content

Commit 7ebc4f1

Browse files
authored
http server sticky (#1512)
* commit WIP * minor text fixes * fix lint not raised by latest ruff version * Add mypy ignore-errors header * Reduce default clients to 2 for CI stability * Handle request timeouts gracefully in sticky routing test * Restore default clients to 10
1 parent 41da872 commit 7ebc4f1

File tree

1 file changed

+275
-0
lines changed

1 file changed

+275
-0
lines changed
Lines changed: 275 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,275 @@
1+
# ---
2+
# mypy: ignore-errors
3+
# ---
4+
5+
# # Sticky routing for Modal HTTP Servers
6+
7+
# This example demonstrates the usage and behavior of
8+
# the optional "sticky" routing behavior of
9+
# Modal HTTP Servers with a basic routing test.
10+
11+
# For a gentler introduction to Modal HTTP Servers,
12+
# see [this example](https://modal.com/docs/examples/http_server).
13+
# For the use of Modal HTTP Servers for LLM inference,
14+
# see [this example](https://modal.com/docs/examples/sglang_low_latency).
15+
16+
# In sticky routing, sequential requests from the same client
17+
# are sent to the same server replica.
18+
# Modal HTTP Servers offer sticky routing for fixed replica sets
19+
# using [rendezvous hashing](https://randorithms.com/2020/12/26/rendezvous-hashing.html),
20+
# ensuring that as your servers scale up and down, load stays balanced across replicas
21+
# and clients are typically routed to the same replica for repeated requests.
22+
23+
# Note that requests are not _guaranteed_ to be routed to the same replica,
24+
# and so this form of sticky routing should not be relied on for logical correctness.
25+
# Instead, this sticky routing is intended to be used as a performance optimization,
26+
# as in KV caching for [Transformer LLM inference](https://modal.com/docs/examples/sglang_low_latency).
27+
28+
# ## Define the Modal HTTP Server
29+
30+
# First, we import the libraries we'll use both locally, to run a routing test,
31+
# and remotely, to run our server.
32+
33+
# We also define our Modal [App](https://modal.com/docs/guide/apps)
34+
# and the Modal [Image](https://modal.com/docs/guide/images)
35+
# that provides the dependencies of our server code.
36+
37+
import asyncio
38+
import time
39+
from dataclasses import dataclass
40+
from typing import Any
41+
42+
import aiohttp
43+
import modal
44+
import modal.experimental
45+
from rich.console import Console
46+
47+
app = modal.App("example-http-server-sticky")
48+
49+
image = modal.Image.debian_slim().uv_pip_install("fastapi[standard]==0.115.4")
50+
51+
# Now we can define our HTTP Server.
52+
# We set the minimum number of containers (replicas)
53+
# to be greater than one so that there are multiple
54+
# replicas available for routing during our test.
55+
56+
# Additionally, we set the regions into which we
57+
# want to deploy the proxies that communicate between
58+
# our clients and the server.
59+
60+
# We also use the [`modal.concurrent` decorator](https://modal.com/docs/guide/concurrent-inputs)
61+
# to allow each HTTP Server replica to handle more than one input.
62+
63+
# Modal HTTP Servers are structured as Modal [Classes](https://modal.com/docs/guide/lifecycle-functions)
64+
# that start a process or thread that listens on the provided `port` in a `modal.enter`-decorated method.
65+
# Here, we spin up a simple FastAPI server that returns the
66+
# [identity of the replica within Modal](https://modal.com/docs/guide/environment_variables)
67+
# and run it with `uvicorn`.
68+
69+
PORT = 8000
70+
CONTAINERS = 2
71+
PROXY_REGIONS = ["us-west"]
72+
73+
74+
@app.cls(image=image, min_containers=CONTAINERS)
@modal.experimental.http_server(port=PORT, proxy_regions=PROXY_REGIONS)
@modal.concurrent(target_inputs=100)
class Server:
    """HTTP Server replica that answers each request with its own container identity."""

    @modal.enter()
    def start(self):
        import os
        import threading

        import uvicorn
        from fastapi import FastAPI

        # Each replica identifies itself by its Modal task (container) ID.
        task_id = os.environ["MODAL_TASK_ID"]
        api = FastAPI(title=task_id)

        @api.post("/")
        async def whoami():
            return {"CONTAINER_ID": task_id}

        # Run uvicorn in a daemon thread so this enter hook can return
        # while the server keeps listening on PORT for the container's lifetime.
        self.thread = threading.Thread(
            target=uvicorn.run,
            kwargs={"app": api, "host": "0.0.0.0", "port": PORT},
            daemon=True,
        )
        self.thread.start()
99+
100+
101+
# ## Test the routing behavior of the Modal HTTP Server
102+
103+
# Now we define our routing test, which will run locally
104+
# and interact with our Modal HTTP Server by sending requests.
105+
106+
# It spins up some `n`umber of `client` tasks and repeatedly sends requests from each for some number of `seconds`.
107+
# The clients can be configured to use `sticky` routing or not (`--no-sticky`).
108+
109+
# The test uses the `CONTAINER_ID`s returned by the HTTP Server
110+
# to track whether clients' requests are serviced by the same or different replicas.
111+
# It fails if the clients were configured to be sticky and any client
112+
# observes a different `CONTAINER_ID` on different requests.
113+
# So long as the set of containers does not change,
114+
# due to, for instance, replica failure or pre-emption,
115+
# this test should pass.
116+
117+
118+
@app.local_entrypoint()
async def test(n_clients: int = 10, sticky: bool = True, seconds: float = 5.0):
    """Run `n_clients` concurrent clients against the server and check routing.

    Raises AssertionError if `sticky` is set and any client observed more
    than one replica.
    """
    # Block until at least one replica is accepting requests.
    url = (await Server._experimental_get_flash_urls.aio())[0]
    async with aiohttp.ClientSession() as session:
        await wait_available(session, url)

    # Allow generous time for all replicas to spin up, based on a rough heuristic;
    # remove this sleep and increase CONTAINERS
    # to observe session routing changes during autoscaling.
    await asyncio.sleep(5 + ((CONTAINERS - 10) // 2))

    # Run the test and fold the per-client results into summary statistics.
    stats = aggregate_results(await run_clients(url, n_clients, seconds, sticky))

    # Give time for server logs to flush, then display results.
    await asyncio.sleep(1)
    print_summary(url, sticky, n_clients, seconds, stats)

    if sticky and stats["multi"]:
        raise AssertionError("Sticky routing violated for some clients")
141+
142+
143+
# Because it is a Modal `local_entrypoint`,
144+
# this Python function automatically gets a CLI:
145+
146+
# ```bash
147+
# modal run http_server_sticky.py --help
148+
# ```
149+
150+
# You can run the test with:
151+
152+
# ```bash
153+
# modal run http_server_sticky.py
154+
# ```
155+
156+
# ## Write the client for the Modal HTTP Server
157+
158+
# The code in this section implements some Modal HTTP Server-specific client logic.
159+
160+
# First, clients of Modal HTTP Servers need to handle
161+
# [503 Service Unavailable](https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Status/503)
162+
# error response status codes, which are returned whenever there are no live replicas.
163+
164+
# In our case, we use them as a signal that at least one replica
165+
# is ready and so we can proceed with the test.
166+
167+
168+
async def wait_available(sess: aiohttp.ClientSession, url: str) -> None:
    """Poll `url` until the server responds with something other than 503.

    Modal HTTP Servers return 503 Service Unavailable while no replica is
    live, so any other status means at least one replica is ready.
    """
    while True:
        async with sess.post(url, json={}) as resp:
            if resp.status != 503:
                return
        # Back off briefly between polls rather than hammering the proxy
        # in a tight request loop while replicas spin up.
        await asyncio.sleep(0.1)
173+
174+
175+
# The full client logic appears in the function below.
176+
# Notably, it includes the header `Modal-Session-Id`
177+
# if clients are configured for sticky routing.
178+
# Here, we choose a simple small integer `client_id`.
179+
180+
# The client collects information about which `CONTAINER_ID`s
181+
# it receives from the server and returns those in the form of
182+
# a simple `dataclass`.
183+
184+
185+
@dataclass
class ClientResult:
    """Summary of one client's requests during the routing test."""

    client_id: int  # small integer identity, also used as the Modal-Session-Id value
    containers_seen: set[str]  # distinct CONTAINER_IDs observed in 200 responses
    requests_ok: int  # number of requests that returned status 200
    requests_err: int  # number of non-200 responses plus timed-out requests
191+
192+
193+
async def client(
    url: str, client_id: int, seconds: float, sticky: bool
) -> ClientResult:
    """Send requests to `url` for `seconds` seconds, recording which replicas answer.

    When `sticky` is set, a constant Modal-Session-Id header is attached so
    the proxy routes all of this client's requests to the same replica.
    """
    headers = {"Modal-Session-Id": str(client_id)} if sticky else {}
    deadline = time.monotonic() + seconds
    timeout = aiohttp.ClientTimeout(total=5)  # invariant, so build it once

    containers: set[str] = set()
    ok_count: int = 0
    err_count: int = 0

    async with aiohttp.ClientSession(headers=headers) as sess:
        while time.monotonic() < deadline:
            try:
                async with sess.post(url, json={}, timeout=timeout) as resp:
                    if resp.status != 200:
                        err_count += 1
                        continue
                    payload = await resp.json()
                    containers.add(payload["CONTAINER_ID"])
                    ok_count += 1
            except asyncio.TimeoutError:
                # A timed-out request counts as an error; keep going.
                err_count += 1

    return ClientResult(client_id, containers, ok_count, err_count)
219+
220+
221+
# ## Addenda
222+
223+
# The remainder of this code is required for this example to run
224+
# but is not necessary for Modal HTTP Servers or their clients in general.
225+
# For instance, it defines the logic for concurrency and result aggregation/display
226+
# for this particular routing test.
227+
228+
229+
async def run_clients(
    url: str, n_clients: int, seconds: float, sticky: bool
) -> list[ClientResult]:
    """Launch `n_clients` concurrent clients and collect their results."""
    coros = (client(url, cid, seconds, sticky) for cid in range(n_clients))
    return list(await asyncio.gather(*coros))
234+
235+
236+
def aggregate_results(results: list[ClientResult]) -> dict[str, Any]:
    """Fold per-client results into summary statistics for display.

    The "multi" entry maps each client_id that saw more than one replica to
    the set of CONTAINER_IDs it observed; a nonempty "multi" means sticky
    routing was violated.
    """
    total_ok = 0
    total_err = 0
    multi: dict[int, set[str]] = {}
    per_client: list[tuple[int, set[str]]] = []

    for result in results:
        total_ok += result.requests_ok
        total_err += result.requests_err
        per_client.append((result.client_id, result.containers_seen))
        if len(result.containers_seen) > 1:
            multi[result.client_id] = result.containers_seen

    return {
        "total_ok": total_ok,
        "total_err": total_err,
        "multi": multi,
        "per_client": per_client,
    }
251+
252+
253+
def print_summary(
    url: str,
    sticky: bool,
    n_clients: int,
    seconds: float,
    stats: dict[str, Any],
    console: Console | None = None,
) -> None:
    """Pretty-print the test configuration and aggregated routing statistics."""
    # Fall back to a fresh rich console when the caller doesn't supply one.
    console = console or Console()

    console.print()
    console.print(
        f"[bold]url=[/]{url} [bold]sticky=[/]{sticky} [bold]clients=[/]{n_clients} [bold]duration_s=[/]{seconds}"
    )
    console.print(
        f"[green]total_ok={stats['total_ok']}[/] [red]total_err={stats['total_err']}[/]"
    )

    for client_id, containers in stats["per_client"]:
        console.print(f"  client={client_id} containers={list(containers)}")
    console.print(
        f"Clients with multiple containers: [yellow]{len(stats['multi'])}/{n_clients}[/]"
    )

0 commit comments

Comments
 (0)