diff --git a/06_gpu_and_ml/llm-serving/lfm_snapshot.py b/06_gpu_and_ml/llm-serving/lfm_snapshot.py
new file mode 100644
index 000000000..165afac26
--- /dev/null
+++ b/06_gpu_and_ml/llm-serving/lfm_snapshot.py
@@ -0,0 +1,520 @@
+# ---
+# deploy: true
+# cmd: ["python", "06_gpu_and_ml/llm-serving/lfm_snapshot.py"]
+# ---
+
+# # Low Latency, Serverless LFM 2 with vLLM and Modal
+
+# In this example, we show how to serve Liquid AI's [LFM 2 models](https://www.liquid.ai/liquid-foundation-models)
+# using [vLLM](https://docs.vllm.ai), with low latency and fast cold starts, on Modal.
+
+# The LFM 2 models are not vanilla Transformers -- they have a hybrid architecture,
+# discovered via an architecture search that optimized for quality, latency, and memory footprint.
+# Check out their [technical report](https://arxiv.org/abs/2511.23404v1)
+# for more details.
+
+# This example demonstrates techniques for running inference at high efficiency,
+# including advanced features of both vLLM and Modal.
+# For a simpler introduction to LLM serving, see
+# [this example](https://modal.com/docs/examples/llm_inference).
+
+# To minimize routing overheads, we use `@modal.experimental.http_server`,
+# which uses a new, low-latency routing service on Modal designed for latency-sensitive inference workloads.
+# This gives us more control over routing, but with increased power comes increased responsibility.
+
+# We also include instructions for cutting cold start times by an order of magnitude using Modal's
+# [CPU + GPU memory snapshots](https://modal.com/docs/guide/memory-snapshot).
+
+# Fast cold starts are particularly useful for LLM inference applications
+# with highly "bursty" workloads, like document processing.
+# See [this guide](https://modal.com/docs/guide/high-performance-llm-inference)
+# for a breakdown of different LLM inference workloads and how to optimize them.
+
+# ## Set up the container image
+
+# Our first order of business is to define the environment our server will run in:
+# the [container `Image`](https://modal.com/docs/guide/images).
+# We'll use the [vLLM inference server](https://docs.vllm.ai).
+
+# While we're at it, we import the dependencies we'll need both remotely and locally (for deployment).
+
+import asyncio
+import json
+import os
+import subprocess
+import time
+
+import aiohttp
+import modal
+import modal.experimental
+
+MINUTES = 60
+
+MODEL_NAME = os.environ.get("MODEL_NAME", "LiquidAI/LFM2-8B-A1B")
+print(f"Running deployment script for model: {MODEL_NAME}")
+
+vllm_image = (
+    modal.Image.from_registry("vllm/vllm-openai:v0.15.1")
+    .entrypoint([])
+    .run_commands("ln -s $(which python3) /usr/bin/python")
+    .pip_install("transformers==5.1.0")
+    .env(
+        {
+            "HF_HUB_CACHE": "/root/.cache/huggingface",
+            "HF_XET_HIGH_PERFORMANCE": "1",
+            "VLLM_SERVER_DEV_MODE": "1",
+            "TORCH_CPP_LOG_LEVEL": "FATAL",
+            "MODEL_NAME": MODEL_NAME,
+        }
+    )
+)
+
+# ### Selecting the GPU
+
+# We choose the [H100 GPU](https://modal.com/blog/introducing-h100),
+# which offers excellent price-performance and has sufficient VRAM to store the model weights.
+
+N_GPU = 1
+GPU = f"H100:{N_GPU}"
+
+# ### Loading and caching the model weights
+
+# We don't want to load the model from the Hub every time we start the server.
+# We can load it much faster from a [Modal Volume](https://modal.com/docs/guide/volumes).
+# Typical speeds are around one to two GB/s.
+
+hf_cache_vol = modal.Volume.from_name("huggingface-cache", create_if_missing=True)
+
+# In addition to pointing the Hugging Face Hub cache at the path
+# where we mount the Volume, we also
+# [turn on "high performance" downloads](https://huggingface.co/docs/hub/en/models-downloading#faster-downloads),
+# which can fully saturate our network bandwidth.
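+
+# If the Volume is empty, the first server boot downloads the weights into it.
+# If you'd rather front-load that download, a one-off Function can pre-populate
+# the cache before you deploy. Below is a minimal sketch, not part of this server:
+# it assumes the `huggingface_hub` package and calls `snapshot_download` against
+# the same Volume (attach the same Hugging Face secret if the model is gated).
+
+# ```python
+# import modal
+#
+# app = modal.App("examples-lfm-prefetch")  # hypothetical one-off App
+# image = modal.Image.debian_slim().pip_install("huggingface_hub")
+# hf_cache_vol = modal.Volume.from_name("huggingface-cache", create_if_missing=True)
+#
+#
+# @app.function(
+#     image=image,
+#     volumes={"/root/.cache/huggingface": hf_cache_vol},
+#     secrets=[modal.Secret.from_name("huggingface-secret-liquid")],
+#     timeout=30 * 60,
+# )
+# def prefetch(model_name: str = "LiquidAI/LFM2-8B-A1B"):
+#     from huggingface_hub import snapshot_download
+#
+#     # writes into the Volume mounted at the Hugging Face cache path
+#     snapshot_download(model_name, cache_dir="/root/.cache/huggingface")
+#
+#
+# @app.local_entrypoint()
+# def main():
+#     prefetch.remote()
+# ```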
+
+# ### Caching compilation artifacts
+
+# Model weights aren't the only thing we want to cache.
+# vLLM also produces compilation artifacts that we want to persist across restarts.
+
+vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True)
+
+# ## Define the inference server and infrastructure
+
+# ### Selecting infrastructure to minimize latency
+
+# Minimizing latency requires geographic co-location of clients and servers.
+
+# So for low latency LLM inference services on Modal, you must select a
+# [cloud region](https://modal.com/docs/guide/region-selection)
+# for both the GPU-accelerated containers running inference
+# and for the internal Modal proxies that forward requests to them
+# as part of defining a `modal.experimental.http_server`.
+
+# Here, we assume users are mostly in the northern half of the Americas
+# and select the `us-east` cloud region to serve them.
+# This should result in at most a few dozen milliseconds of round-trip time.
+
+REGION = "us-east"
+
+# For production-scale LLM inference services, there are generally
+# enough requests to justify keeping at least one replica running at all times.
+# Having a "warm" or "live" replica reduces latency by skipping the slow initialization work
+# that occurs when a new replica boots up (a ["cold start"](https://modal.com/docs/guide/cold-start)).
+# For LLM inference servers, that latency runs from seconds to minutes.
+
+# However, since this is documentation code, we'll set the `min_containers` of our Modal Function
+# to `0` to avoid surprise bills during casual use.
+
+MIN_CONTAINERS = 0
+
+# Finally, we need to decide how we will scale replicas up and down
+# in response to load. Without autoscaling, users' requests will queue
+# when the server becomes overloaded. Even apart from queueing, responses
+# generally become slower per user above a certain minimum number of
+# concurrent requests.
+
+# So we set a target for the number of inputs to run on a single container
+# with [`modal.concurrent`](https://modal.com/docs/reference/modal.concurrent).
+# For details, see [the guide](https://modal.com/docs/guide/concurrent-inputs).
+
+# Generally, this choice needs to be made as part of
+# [LLM inference engine benchmarking](https://modal.com/llm-almanac/how-to-benchmark).
+
+TARGET_INPUTS = 32
+MAX_INPUTS = 100
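+
+# To build intuition for what this target implies: at 32 inputs per container,
+# a burst of 100 concurrent requests should scale out to about four replicas.
+# Below is a minimal sketch of that back-of-the-envelope calculation -- simplified,
+# since Modal's real autoscaler also weighs other signals and settings:
+
+# ```python
+# import math
+#
+# TARGET_INPUTS = 32  # matches the value above
+#
+#
+# def replicas_for(concurrent_requests: int) -> int:
+#     # simplified: containers needed to keep per-container load near the target
+#     return max(1, math.ceil(concurrent_requests / TARGET_INPUTS))
+#
+#
+# assert replicas_for(100) == 4  # ceil(100 / 32)
+# ```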
+
+# ## Speed up cold starts with GPU snapshotting
+
+# Modal is a serverless compute platform, so all of your
+# inference services automatically scale up and down to handle
+# variable load.
+
+# Scaling up a new replica requires quite a bit of work --
+# loading up Python and system packages, loading model weights,
+# setting up the inference engine, and so on.
+
+# We can skip over and speed up much of this work
+# when spinning up new replicas after the first
+# by booting directly from a [memory snapshot](https://modal.com/docs/guide/memory-snapshot),
+# which contains the exact in-memory representation of our server just before it begins taking requests.
+
+# Most applications can be snapshotted and see substantial speedups (2x to 10x;
+# see [our initial benchmarks here](https://modal.com/blog/gpu-mem-snapshots)).
+# However, it generally requires some extra work to adapt the application code.
+
+# vLLM supports a sleep mode that allows us to leverage Modal's
+# [CPU + GPU memory snapshots](https://modal.com/docs/guide/memory-snapshot)
+# for dramatically faster cold starts.
+
+# When `enable_memory_snapshot=True` and `experimental_options={"enable_gpu_snapshot": True}`
+# are set on the class, Modal captures both CPU and GPU memory state.
+# The `@modal.enter(snap=True)` method runs before the snapshot is taken:
+# we start vLLM, wait for it to be ready, warm it up, then put it to sleep.
+# The `@modal.enter(snap=False)` method runs after restoring from the snapshot:
+# we wake vLLM back up so it can serve requests immediately.
+
+# ### Sleeping and waking a vLLM server
+
+# We prepare our vLLM inference server for snapshotting by first sending
+# a few requests to "warm it up", ensuring that it is fully ready to process requests.
+# Then we "put it to sleep", moving non-essential data out of GPU memory,
+# with a request to `/sleep`. At this point, we can take a memory snapshot.
+# Upon snapshot restoration, we "wake up" the server with a request to `/wake_up`.
+
+# We use the [`requests` library](https://requests.readthedocs.io/en/latest/)
+# to send ourselves these HTTP requests on
+# [`localhost`/`127.0.0.1`](https://superuser.com/questions/31824/why-is-localhost-ip-127-0-0-1).
+
+VLLM_PORT = 8000
+
+with vllm_image.imports():
+    import requests
+
+
+def wait_ready(process: subprocess.Popen, timeout: int = 15 * MINUTES):
+    """Poll the vLLM /health endpoint until the server is ready or the deadline passes."""
+    deadline = time.time() + timeout
+    while time.time() < deadline:
+        try:
+            check_running(process)  # fail fast if the server process has died
+            requests.get(f"http://127.0.0.1:{VLLM_PORT}/health").raise_for_status()
+            return
+        except (
+            requests.exceptions.ConnectionError,
+            requests.exceptions.HTTPError,
+        ):
+            time.sleep(5)
+    raise TimeoutError(f"vLLM server not ready within {timeout} seconds")
+
+
+def check_running(p: subprocess.Popen):
+    """Raise if the vLLM subprocess has exited."""
+    if (rc := p.poll()) is not None:
+        raise subprocess.CalledProcessError(rc, cmd=p.args)
+
+
+def warmup():
+    """Send a few real requests so lazy initialization finishes before we snapshot."""
+    payload = {
+        "model": "llm",
+        "messages": [{"role": "user", "content": "Hello, how are you?"}],
+        "max_tokens": 16,
+    }
+    for _ in range(3):
+        requests.post(
+            f"http://127.0.0.1:{VLLM_PORT}/v1/chat/completions",
+            json=payload,
+            timeout=60,
+        ).raise_for_status()
+
+
+def sleep(level: int = 1):
+    """Put vLLM to sleep, moving non-essential state out of GPU memory."""
+    requests.post(
+        f"http://127.0.0.1:{VLLM_PORT}/sleep?level={level}"
+    ).raise_for_status()
+
+
+def wake_up():
+    """Wake vLLM from sleep mode so it can serve requests again."""
+    requests.post(f"http://127.0.0.1:{VLLM_PORT}/wake_up").raise_for_status()
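+
+
+# These helpers aren't Modal-specific: you can exercise the same sleep/wake cycle
+# by hand against any vLLM server launched with `--enable-sleep-mode` and, as this
+# example's image does, `VLLM_SERVER_DEV_MODE=1` (the sleep routes are development-mode
+# endpoints). A minimal sketch, assuming a server is already listening on port 8000:
+
+# ```python
+# import requests
+#
+# base = "http://127.0.0.1:8000"
+# requests.get(f"{base}/health").raise_for_status()  # confirm the server is up
+# # level 1 offloads weights to CPU memory for a fast wake-up; per vLLM's docs,
+# # level 2 also discards the weights, trading wake-up speed for less CPU RAM
+# requests.post(f"{base}/sleep?level=1").raise_for_status()
+# requests.post(f"{base}/wake_up").raise_for_status()  # ready to serve again
+# ```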
+
+# ### Controlling container lifecycles with `modal.Cls`
+
+# We wrap up all of the choices we made about the infrastructure
+# of our inference server into a number of Python decorators
+# applied to a Python class that encapsulates the logic
+# to run our server.
+
+# The key decorators are:
+
+# - [`@app.cls`](https://modal.com/docs/guide/lifecycle-functions) to define the core of our service.
+# We attach our Image, request a GPU, attach our cache Volumes, specify the region, and configure auto-scaling.
+# See [the reference documentation](https://modal.com/docs/reference/modal.App#cls) for details.
+
+# - `@modal.experimental.http_server` to turn our Python code into an HTTP server
+# (i.e., fronting all of our containers with a proxy that has a URL). The wrapped code
+# needs to eventually listen for HTTP connections on the provided `port`.
+
+# - [`@modal.concurrent`](https://modal.com/docs/guide/concurrent-inputs) to specify how many
+# requests our server can handle before we need to scale up.
+
+# - [`@modal.enter` and `@modal.exit`](https://modal.com/docs/guide/lifecycle-functions) to indicate
+# which methods of the class should be run when starting the server and shutting it down.
+# The `snap=True`/`snap=False` distinction controls which methods run before/after a memory snapshot.
+
+# Modal considers a new replica ready to receive inputs once the `modal.enter` methods have exited
+# and the container accepts connections.
+
+# With all this in place, we are ready to define our high-performance, low-latency
+# LFM 2 inference server.
+
+app = modal.App("examples-lfm-snapshot")
+
+
+@app.cls(
+    image=vllm_image,
+    gpu=GPU,
+    scaledown_window=5 * MINUTES,
+    timeout=15 * MINUTES,
+    volumes={
+        "/root/.cache/huggingface": hf_cache_vol,
+        "/root/.cache/vllm": vllm_cache_vol,
+    },
+    secrets=[modal.Secret.from_name("huggingface-secret-liquid")],
+    enable_memory_snapshot=True,
+    experimental_options={"enable_gpu_snapshot": True},
+    region=REGION,
+    min_containers=MIN_CONTAINERS,
+)
+@modal.experimental.http_server(
+    port=VLLM_PORT,
+    proxy_regions=[REGION],
+    exit_grace_period=5,
+)
+@modal.concurrent(target_inputs=TARGET_INPUTS)
+class LfmVllmInference:
+    @modal.enter(snap=True)
+    def startup(self):
+        """Start the vLLM server and block until it is healthy, then warm it up and put it to sleep."""
+        cmd = [
+            "vllm",
+            "serve",
+            MODEL_NAME,
+            # serve the model under both its full name and a short alias
+            "--served-model-name",
+            MODEL_NAME,
+            "llm",
+            "--host",
+            "0.0.0.0",
+            "--port",
+            f"{VLLM_PORT}",
+            "--dtype",
+            "bfloat16",
+            "--gpu-memory-utilization",
+            "0.8",
+            "--max-num-seqs",
+            f"{MAX_INPUTS}",
+            "--max-cudagraph-capture-size",
+            f"{MAX_INPUTS}",
+            "--enable-sleep-mode",
+        ]
+
+        print(*cmd)
+        self.process = subprocess.Popen(cmd)
+        wait_ready(self.process)
+        warmup()
+        sleep(1)
+
+    @modal.enter(snap=False)
+    def restore(self):
+        """Wake vLLM from sleep mode after restoring from a memory snapshot."""
+        wake_up()
+
+    @modal.exit()
+    def stop(self):
+        """Terminate the vLLM subprocess on container shutdown."""
+        self.process.terminate()
+
+
+# ## Deploy the server
+
+# To deploy the server on Modal, just run
+
+# ```bash
+# modal deploy lfm_snapshot.py
+# ```
+
+# This will create a new App on Modal and build the container image for it if it hasn't been built yet.
+
+# ## Interact with the server
+
+# Once it is deployed, you'll see a URL appear in the command line,
+# something like `https://your-workspace-name--examples-lfm-snapshot-lfmvllminference.us-east.modal.direct`.
+
+# You can find [interactive Swagger UI docs](https://swagger.io/tools/swagger-ui/)
+# at the `/docs` route of that URL, i.e. `https://your-workspace-name--examples-lfm-snapshot-lfmvllminference.us-east.modal.direct/docs`.
+# These docs describe each route, indicate the expected inputs and outputs,
+# and translate requests into `curl` commands.
+# For simple routes, you can even send a request directly from the docs page.
+
+# Note: when no replicas are available, Modal will respond with
+# the [503 Service Unavailable status](https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Status/503).
+# In your browser, you can just hit refresh until the docs page appears.
+# You can see the status of the application and its containers on your [Modal dashboard](https://modal.com/apps).
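+
+# Because vLLM speaks the OpenAI-compatible API, any OpenAI client can talk to the
+# deployed server. Below is a minimal sketch using the official `openai` Python
+# package (an assumption -- it isn't used elsewhere in this example); substitute
+# your own deployment's URL:
+
+# ```python
+# from openai import OpenAI  # pip install openai
+#
+# client = OpenAI(
+#     base_url="https://your-workspace-name--examples-lfm-snapshot-lfmvllminference.us-east.modal.direct/v1",
+#     api_key="unused",  # vLLM ignores the key unless configured to require one
+# )
+# response = client.chat.completions.create(
+#     model="llm",  # the short alias we registered via --served-model-name
+#     messages=[{"role": "user", "content": "Tell me a joke."}],
+# )
+# print(response.choices[0].message.content)
+# ```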
+
+# ## Test the server
+
+# To make it easier to test the server setup, we also include a `local_entrypoint`
+# that hits the server with a simple client.
+
+# If you execute the command
+
+# ```bash
+# modal run lfm_snapshot.py
+# ```
+
+# a fresh replica of the server will be spun up on Modal while
+# the code below executes on your local machine.
+
+# Think of this like writing simple tests inside of the `if __name__ == "__main__"`
+# block of a Python script, but for cloud deployments!
+
+
+@app.local_entrypoint()
+async def test(test_timeout=10 * MINUTES, prompt=None, twice=True):
+    url = LfmVllmInference._experimental_get_flash_urls()[0]
+
+    if prompt is None:
+        prompt = "Count to 1000, slowly."
+
+    messages = [
+        {"role": "user", "content": prompt},
+    ]
+
+    print(f"Sending messages to {url}:", *messages, sep="\n\t")
+    await probe(url, messages, timeout=test_timeout)
+    if twice:
+        messages = [{"role": "user", "content": "Tell me a joke."}]
+        print(f"Sending messages to {url}:", *messages, sep="\n\t")
+        await probe(url, messages, timeout=1 * MINUTES)
+
+
+# This test relies on the `probe` helper function below,
+# which pings the server and waits for a valid response to stream.
+
+# The `probe` helper specifically ignores
+# two types of errors that can occur while a replica
+# is starting up -- timeouts on the client and 503 responses from the server.
+# Modal returns the [503 Service Unavailable status](https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Status/503)
+# when an `experimental.http_server` has no live replicas.
+
+# We include a header with each request --
+# `Modal-Session-ID`.
+# Requests that share a value for this header are routed to the same container,
+# so long as the set of containers is stable.
+# Set it to a different value per multi-turn interaction
+# (prototypically, a user conversation thread with a chatbot)
+# to improve KV cache hit rates, as sketched after the `probe` helper below.
+# Note that this header is only compatible with
+# Modal `http_server`s.
+
+
+async def probe(url, messages=None, timeout=5 * MINUTES):
+    if messages is None:
+        messages = [{"role": "user", "content": "Tell me a joke."}]
+
+    client_id = str(0)  # a fixed session ID, since this client sends one conversation
+    headers = {"Modal-Session-ID": client_id}
+    deadline = time.time() + timeout
+    async with aiohttp.ClientSession(base_url=url, headers=headers) as session:
+        while time.time() < deadline:
+            try:
+                await _send_request_streaming(session, messages)
+                return
+            except asyncio.TimeoutError:
+                await asyncio.sleep(1)
+            except aiohttp.client_exceptions.ClientResponseError as e:
+                if e.status == 503:
+                    await asyncio.sleep(1)
+                    continue
+                raise e
+    raise TimeoutError(f"No response from server within {timeout} seconds")
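+
+
+# As a concrete illustration of the `Modal-Session-ID` header, here's a sketch of a
+# multi-turn client that assigns one ID per conversation. The `uuid4` choice is an
+# assumption -- any string that stays stable for the conversation's lifetime works:
+
+# ```python
+# import uuid
+#
+# import aiohttp
+#
+#
+# async def converse(url: str, turns: list[str]) -> None:
+#     # one session ID per conversation keeps every turn on the same container,
+#     # so the server can reuse its KV cache across turns
+#     headers = {"Modal-Session-ID": str(uuid.uuid4())}
+#     messages = []
+#     async with aiohttp.ClientSession(base_url=url, headers=headers) as session:
+#         for turn in turns:
+#             messages.append({"role": "user", "content": turn})
+#             async with session.post(
+#                 "/v1/chat/completions", json={"model": "llm", "messages": messages}
+#             ) as resp:
+#                 resp.raise_for_status()
+#                 body = await resp.json()
+#                 reply = body["choices"][0]["message"]["content"]
+#                 messages.append({"role": "assistant", "content": reply})
+#                 print(reply)
+# ```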
+
+
+async def _send_request_streaming(
+    session: aiohttp.ClientSession, messages: list, timeout: int | None = None
+) -> None:
+    payload = {"model": "llm", "messages": messages, "stream": True}
+    headers = {"Accept": "text/event-stream"}
+
+    async with session.post(
+        "/v1/chat/completions", json=payload, headers=headers, timeout=timeout
+    ) as resp:
+        resp.raise_for_status()
+        full_text = ""
+
+        async for raw in resp.content:
+            line = raw.decode("utf-8", errors="ignore").strip()
+            if not line:
+                continue
+
+            if not line.startswith("data:"):
+                continue
+
+            data = line[len("data:") :].strip()
+            if data == "[DONE]":
+                break
+
+            try:
+                evt = json.loads(data)
+            except json.JSONDecodeError:
+                continue
+
+            delta = (evt.get("choices") or [{}])[0].get("delta") or {}
+            chunk = delta.get("content")
+
+            if chunk:
+                # flush at sentence and line boundaries so tokens appear promptly
+                print(chunk, end="", flush=("\n" in chunk or "." in chunk))
+                full_text += chunk
+        print()
+        print(full_text)
+
+
+# ### Test memory snapshotting
+
+# Using `modal run` creates an ephemeral Modal App,
+# rather than a deployed Modal App.
+# Ephemeral Modal Apps are short-lived,
+# so they turn off snapshotting.
+
+# To test the memory snapshot version of the server,
+# first deploy it with `modal deploy`
+# and then hit it with a client.
+
+# You should observe startup improvements
+# after a handful of cold starts
+# (usually fewer than five).
+# If you want to see the speedup during a test,
+# we recommend heading to the deployed App in your
+# [Modal dashboard](https://modal.com/apps)
+# and manually stopping containers after they have served a request.
+
+# You can use the client code below to test the endpoint.
+# It can be run with the command
+
+# ```
+# python lfm_snapshot.py
+# ```
+
+if __name__ == "__main__":
+    LfmVllmInference = modal.Cls.from_name("examples-lfm-snapshot", "LfmVllmInference")
+
+    async def main():
+        url = LfmVllmInference._experimental_get_flash_urls()[0]
+        messages = [{"role": "user", "content": "Tell me a joke."}]
+        await probe(url, messages, timeout=10 * MINUTES)
+
+    try:
+        print("calling inference server")
+        asyncio.run(main())
+    except modal.exception.NotFoundError as e:
+        raise Exception(
+            f"To take advantage of GPU snapshots, deploy first with `modal deploy {__file__}`"
+        ) from e
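+
+# To put a number on the speedup, you can time the cold start from the client side.
+# A minimal sketch: stop the App's containers from the dashboard, then run this so
+# the next request triggers a fresh cold start (it reuses `probe` from this file):
+
+# ```python
+# import asyncio
+# import time
+#
+# import modal
+#
+# LfmVllmInference = modal.Cls.from_name("examples-lfm-snapshot", "LfmVllmInference")
+# url = LfmVllmInference._experimental_get_flash_urls()[0]
+#
+# start = time.monotonic()
+# asyncio.run(probe(url, timeout=10 * 60))  # `probe` as defined in this file
+# print(f"cold start + first streamed response: {time.monotonic() - start:.1f}s")
+# ```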