From b2d1c067a1eb519303b4b6d1b505fc96dc82a102 Mon Sep 17 00:00:00 2001 From: asherrick Date: Tue, 12 Nov 2024 19:43:00 -0700 Subject: [PATCH 01/14] Add server example. --- examples/server/README.md | 13 ++++ examples/server/requirements.in | 9 +++ examples/server/requirements.txt | 124 +++++++++++++++++++++++++++++++ examples/server/server.py | 113 ++++++++++++++++++++++++++++ 4 files changed, 259 insertions(+) create mode 100644 examples/server/README.md create mode 100644 examples/server/requirements.in create mode 100644 examples/server/requirements.txt create mode 100644 examples/server/server.py diff --git a/examples/server/README.md b/examples/server/README.md new file mode 100644 index 000000000000..01d4961d9026 --- /dev/null +++ b/examples/server/README.md @@ -0,0 +1,13 @@ + +## OpenAI Compatible `/v1/images/generations` Server + +This is a concurrent, multithreaded solution for running a server that can generate images using the `diffusers` library. This examples uses the Stable Diffusion 3 pipeline, but you can use any pipeline that you would like by swapping out the model and pipeline to be the ones that you want to use. + +The pipeline can have its dependencies installed with: +``` +pip install -f requirements.txt +``` +If you need to upgrade some dependencies, you can do that with either [pip-tools](https://github.com/jazzband/pip-tools) or [uv](). 
With `uv`, this looks like: +``` +uv pip compile requirements.in -o requirements.txt +``` diff --git a/examples/server/requirements.in b/examples/server/requirements.in new file mode 100644 index 000000000000..b49b285a8fc8 --- /dev/null +++ b/examples/server/requirements.in @@ -0,0 +1,9 @@ +torch~=2.4.0 +transformers==4.46.1 +sentencepiece +aiohttp +py-consul +prometheus_client >= 0.18.0 +prometheus-fastapi-instrumentator >= 7.0.0 +fastapi +uvicorn \ No newline at end of file diff --git a/examples/server/requirements.txt b/examples/server/requirements.txt new file mode 100644 index 000000000000..065a381f0c9b --- /dev/null +++ b/examples/server/requirements.txt @@ -0,0 +1,124 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile requirements.in -o requirements.txt +aiohappyeyeballs==2.4.3 + # via aiohttp +aiohttp==3.10.10 + # via -r requirements.in +aiosignal==1.3.1 + # via aiohttp +annotated-types==0.7.0 + # via pydantic +anyio==4.6.2.post1 + # via starlette +attrs==24.2.0 + # via aiohttp +certifi==2024.8.30 + # via requests +charset-normalizer==3.4.0 + # via requests +click==8.1.7 + # via uvicorn +fastapi==0.115.3 + # via -r requirements.in +filelock==3.16.1 + # via + # huggingface-hub + # torch + # transformers +frozenlist==1.5.0 + # via + # aiohttp + # aiosignal +fsspec==2024.10.0 + # via + # huggingface-hub + # torch +h11==0.14.0 + # via uvicorn +huggingface-hub==0.26.1 + # via + # tokenizers + # transformers +idna==3.10 + # via + # anyio + # requests + # yarl +jinja2==3.1.4 + # via torch +markupsafe==3.0.2 + # via jinja2 +mpmath==1.3.0 + # via sympy +multidict==6.1.0 + # via + # aiohttp + # yarl +networkx==3.4.2 + # via torch +numpy==2.1.2 + # via transformers +packaging==24.1 + # via + # huggingface-hub + # transformers +prometheus-client==0.21.0 + # via + # -r requirements.in + # prometheus-fastapi-instrumentator +prometheus-fastapi-instrumentator==7.0.0 + # via -r requirements.in +propcache==0.2.0 + # via yarl 
+py-consul==1.5.3 + # via -r requirements.in +pydantic==2.9.2 + # via fastapi +pydantic-core==2.23.4 + # via pydantic +pyyaml==6.0.2 + # via + # huggingface-hub + # transformers +regex==2024.9.11 + # via transformers +requests==2.32.3 + # via + # huggingface-hub + # py-consul + # transformers +safetensors==0.4.5 + # via transformers +sentencepiece==0.2.0 + # via -r requirements.in +sniffio==1.3.1 + # via anyio +starlette==0.41.0 + # via + # fastapi + # prometheus-fastapi-instrumentator +sympy==1.13.3 + # via torch +tokenizers==0.20.1 + # via transformers +torch==2.4.1 + # via -r requirements.in +tqdm==4.66.5 + # via + # huggingface-hub + # transformers +transformers==4.46.1 + # via -r requirements.in +typing-extensions==4.12.2 + # via + # fastapi + # huggingface-hub + # pydantic + # pydantic-core + # torch +urllib3==2.2.3 + # via requests +uvicorn==0.32.0 + # via -r requirements.in +yarl==1.16.0 + # via aiohttp diff --git a/examples/server/server.py b/examples/server/server.py new file mode 100644 index 000000000000..08e8b6fa801f --- /dev/null +++ b/examples/server/server.py @@ -0,0 +1,113 @@ +from fastapi import FastAPI, HTTPException +from fastapi.staticfiles import StaticFiles +from fastapi.middleware.cors import CORSMiddleware +from pydantic import BaseModel +import aiohttp, asyncio, logging, os, random, sys, tempfile, torch, traceback, uuid +from diffusers.pipelines.stable_diffusion_3 import StableDiffusion3Pipeline + +logger = logging.getLogger(__name__) + +class TextToImageInput(BaseModel): + model: str + prompt: str + size: str | None = None + n: int | None = None + +class HttpClient: + session: aiohttp.ClientSession = None + + def start(self): + self.session = aiohttp.ClientSession() + + async def stop(self): + await self.session.close() + self.session = None + + def __call__(self) -> aiohttp.ClientSession: + assert self.session is not None + return self.session + +class TextToImagePipeline: + pipeline: StableDiffusion3Pipeline = None + device: str = None 
+ + def start(self): + if torch.cuda.is_available(): + model_path = os.getenv("MODEL_PATH", "stabilityai/stable-diffusion-3.5-large") + logger.info("Loading CUDA") + self.device = "cuda" + self.pipeline = StableDiffusion3Pipeline.from_pretrained( + model_path, + torch_dtype=torch.bfloat16, + ).to(device=self.device) + elif torch.backends.mps.is_available(): + model_path = os.getenv("MODEL_PATH", "stabilityai/stable-diffusion-3.5-medium") + logger.info("Loading MPS for Mac M Series") + self.device = "mps" + self.pipeline = StableDiffusion3Pipeline.from_pretrained( + model_path, + torch_dtype=torch.bfloat16, + ).to(device=self.device) + else: + raise Exception("No CUDA or MPS device available") + +app = FastAPI() +service_url = os.getenv("SERVICE_URL", "http://localhost:8000") +image_dir = os.path.join(tempfile.gettempdir(), "images") +if not os.path.exists(image_dir): + os.makedirs(image_dir) +app.mount("/images", StaticFiles(directory=image_dir), name="images") +http_client = HttpClient() +shared_pipeline = TextToImagePipeline() + +# Configure CORS settings +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], # Allows all origins + allow_credentials=True, + allow_methods=["*"], # Allows all methods, e.g., GET, POST, OPTIONS, etc. + allow_headers=["*"], # Allows all headers +) + +@app.on_event("startup") +def startup(): + http_client.start() + shared_pipeline.start() + +def save_image(image): + filename = "draw" + str(uuid.uuid4()).split('-')[0] + ".png" + image_path = os.path.join(image_dir, filename) + # write image to disk at image_path + logger.info(f"Saving image to {image_path}") + image.save(image_path) + return os.path.join(service_url, "images", filename) + +@app.get('/') +@app.post('/') +@app.options('/') +async def base(): + return "Welcome to Diffusers! 
Where you can use diffusion models to generate images" + +@app.post("/v1/images/generations") +async def generate_image(image_input: TextToImageInput): + try: + loop = asyncio.get_event_loop() + scheduler = shared_pipeline.pipeline.scheduler.from_config(shared_pipeline.pipeline.scheduler.config) + pipeline = StableDiffusion3Pipeline.from_pipe(shared_pipeline.pipeline, scheduler=scheduler) + generator =torch.Generator(device="cuda") + generator.manual_seed(random.randint(0, 10000000)) + output = await loop.run_in_executor(None, lambda: pipeline(image_input.prompt, generator = generator)) + logger.info(f"output: {output}") + image_url = save_image(output.images[0]) + return {"data": [{"url": image_url}]} + except Exception as e: + if isinstance(e, HTTPException): + raise e + elif hasattr(e, 'message'): + raise HTTPException(status_code=500, detail=e.message + traceback.format_exc()) + raise HTTPException(status_code=500, detail=str(e) + traceback.format_exc()) + +if __name__ == "__main__": + import uvicorn + + uvicorn.run(app, host="0.0.0.0", port=8000) \ No newline at end of file From 040806c4be05a8660359e08157d4fcb974319732 Mon Sep 17 00:00:00 2001 From: asherrick Date: Tue, 12 Nov 2024 19:44:36 -0700 Subject: [PATCH 02/14] Minor updates to README. --- examples/server/README.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/examples/server/README.md b/examples/server/README.md index 01d4961d9026..91cac212babc 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -3,11 +3,16 @@ This is a concurrent, multithreaded solution for running a server that can generate images using the `diffusers` library. This examples uses the Stable Diffusion 3 pipeline, but you can use any pipeline that you would like by swapping out the model and pipeline to be the ones that you want to use. 
+### Installing Dependencies + The pipeline can have its dependencies installed with: ``` pip install -f requirements.txt ``` -If you need to upgrade some dependencies, you can do that with either [pip-tools](https://github.com/jazzband/pip-tools) or [uv](). With `uv`, this looks like: + +### Upgrading Dependencies + +If you need to upgrade some dependencies, you can do that with either [pip-tools](https://github.com/jazzband/pip-tools) or [uv](https://github.com/astral-sh/uv). With `uv`, this looks like: ``` uv pip compile requirements.in -o requirements.txt ``` From a91feecd5b21de4795f275bffed9d3d5c63476a8 Mon Sep 17 00:00:00 2001 From: asherrick Date: Tue, 12 Nov 2024 20:07:10 -0700 Subject: [PATCH 03/14] Add fixes after local testing. --- examples/server/README.md | 19 ++++++++++++++++++- examples/server/server.py | 2 +- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index 91cac212babc..1bfdf2050f15 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -5,14 +5,31 @@ This is a concurrent, multithreaded solution for running a server that can gener ### Installing Dependencies -The pipeline can have its dependencies installed with: +Start by going to the base of the repo and installing it with: +``py +pip install . ``` + +The pipeline can then have its dependencies installed with: +```py pip install -f requirements.txt ``` +### Running the server + +This server can be run with: +```py +python server.py +``` +The server will be spun up at http://localhost:8000. 
You can `curl` this model with the following command: +``` +curl -X POST -H "Content-Type: application/json" --data '{"model": "something", "prompt": "a kitten in front of a fireplace"}' http://localhost:8000/v1/images/generations +``` + ### Upgrading Dependencies If you need to upgrade some dependencies, you can do that with either [pip-tools](https://github.com/jazzband/pip-tools) or [uv](https://github.com/astral-sh/uv). With `uv`, this looks like: ``` uv pip compile requirements.in -o requirements.txt ``` + diff --git a/examples/server/server.py b/examples/server/server.py index 08e8b6fa801f..7e1968de58ac 100644 --- a/examples/server/server.py +++ b/examples/server/server.py @@ -94,7 +94,7 @@ async def generate_image(image_input: TextToImageInput): loop = asyncio.get_event_loop() scheduler = shared_pipeline.pipeline.scheduler.from_config(shared_pipeline.pipeline.scheduler.config) pipeline = StableDiffusion3Pipeline.from_pipe(shared_pipeline.pipeline, scheduler=scheduler) - generator =torch.Generator(device="cuda") + generator =torch.Generator(device=shared_pipeline.device) generator.manual_seed(random.randint(0, 10000000)) output = await loop.run_in_executor(None, lambda: pipeline(image_input.prompt, generator = generator)) logger.info(f"output: {output}") From be39807413de650bff575066e55f11c78521d54f Mon Sep 17 00:00:00 2001 From: Grant Sherrick Date: Wed, 13 Nov 2024 15:31:27 -0700 Subject: [PATCH 04/14] Apply suggestions from code review Updates to README from code review Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- examples/server/README.md | 31 +++++++++++-------------------- 1 file changed, 11 insertions(+), 20 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index 1bfdf2050f15..15767b4cbdb8 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -1,35 +1,26 @@ -## OpenAI Compatible `/v1/images/generations` Server +# Create a server -This is a concurrent, multithreaded 
solution for running a server that can generate images using the `diffusers` library. This examples uses the Stable Diffusion 3 pipeline, but you can use any pipeline that you would like by swapping out the model and pipeline to be the ones that you want to use. +Diffusers' pipelines can be used as an inference engine for a server. It supports concurrent and multithreaded requests to generate images that may be requested by multiple users at the same time. -### Installing Dependencies +This guide will show you how to use the [`StableDiffusion3Pipeline`] in a server, but feel free to use any pipeline you want. + + +Start by navigating to the `examples/server` folder and installing all the dependencies. -Start by going to the base of the repo and installing it with: ``py pip install . -``` - -The pipeline can then have its dependencies installed with: -```py pip install -f requirements.txt -``` -### Running the server -This server can be run with: + +Launch the server with the following command. + ```py python server.py -``` -The server will be spun up at http://localhost:8000. You can `curl` this model with the following command: -``` -curl -X POST -H "Content-Type: application/json" --data '{"model": "something", "prompt": "a kitten in front of a fireplace"}' http://localhost:8000/v1/images/generations -``` -### Upgrading Dependencies -If you need to upgrade some dependencies, you can do that with either [pip-tools](https://github.com/jazzband/pip-tools) or [uv](https://github.com/astral-sh/uv). With `uv`, this looks like: -``` -uv pip compile requirements.in -o requirements.txt +If you need to upgrade some dependencies, you can use either [pip-tools](https://github.com/jazzband/pip-tools) or [uv](https://github.com/astral-sh/uv). For example, upgrade the dependencies with `uv` using the following command. 
+ ``` From 3bf2c494cd338d885622e29fa57aa822d921bd25 Mon Sep 17 00:00:00 2001 From: asherrick Date: Wed, 13 Nov 2024 15:38:28 -0700 Subject: [PATCH 05/14] More doc updates. --- examples/server/README.md | 42 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index 15767b4cbdb8..b2201c7f50cc 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -6,21 +6,57 @@ Diffusers' pipelines can be used as an inference engine for a server. It support This guide will show you how to use the [`StableDiffusion3Pipeline`] in a server, but feel free to use any pipeline you want. -Start by navigating to the `examples/server` folder and installing all the dependencies. +Start by navigating to the `examples/server` folder and installing all of the dependencies. ``py pip install . pip install -f requirements.txt - - +``` Launch the server with the following command. ```py python server.py +``` +The server is accessed at http://localhost:8000. You can curl this model with the following command. +``` +curl -X POST -H "Content-Type: application/json" --data '{"model": "something", "prompt": "a kitten in front of a fireplace"}' http://localhost:8000/v1/images/generations +``` If you need to upgrade some dependencies, you can use either [pip-tools](https://github.com/jazzband/pip-tools) or [uv](https://github.com/astral-sh/uv). For example, upgrade the dependencies with `uv` using the following command. ``` +uv pip compile requirements.in -o requirements.txt +``` + +## How does this Server Work? + +The server is built with [FastAPI](https://fastapi.tiangolo.com/async/). 
The endpoint for `v1/images/generations` is defined like this: +```py +@app.post("/v1/images/generations") +async def generate_image(image_input: TextToImageInput): + try: + loop = asyncio.get_event_loop() + scheduler = shared_pipeline.pipeline.scheduler.from_config(shared_pipeline.pipeline.scheduler.config) + pipeline = StableDiffusion3Pipeline.from_pipe(shared_pipeline.pipeline, scheduler=scheduler) + generator =torch.Generator(device="cuda") + generator.manual_seed(random.randint(0, 10000000)) + output = await loop.run_in_executor(None, lambda: pipeline(image_input.prompt, generator = generator)) + logger.info(f"output: {output}") + image_url = save_image(output.images[0]) + return {"data": [{"url": image_url}]} + except Exception as e: + if isinstance(e, HTTPException): + raise e + elif hasattr(e, 'message'): + raise HTTPException(status_code=500, detail=e.message + traceback.format_exc()) + raise HTTPException(status_code=500, detail=str(e) + traceback.format_exc()) +``` +Above, the `generate_image` function is defined as asynchronous with the `async` keyword so that [FastAPI](https://fastapi.tiangolo.com/async/) knows that whatever is happening in this function is not going to necessarily return a result right away. Once it hits some point in the function that it needs to await some other [Task](https://docs.python.org/3/library/asyncio-task.html#asyncio.Task), the main thread goes back to answering other HTTP requests. For us, this happens when it hits this part of the function: +```py +output = await loop.run_in_executor(None, lambda: pipeline(image_input.prompt, generator = generator)) +``` +At this point, we are tossing the execution of the pipeline function [onto a new thread](https://docs.python.org/3/library/asyncio-eventloop.html#asyncio.loop.run_in_executor) and the main thread knows to go and do some other things until a result is returned from the `pipeline`. 
+Another important aspect of this implementation is the portion which creates a Pipeline from the `shared_pipeline`. The goal behind this is to avoid loading the underlying model more than once into the GPU while still allowing for each new request that is running on its own thread to have its own generator and scheduler. The scheduler in particular, at the time of this writing (November 2024), is not thread safe, and it will cause errors like: `IndexError: index 21 is out of bounds for dimension 0 with size 21` if you do try to use the same scheduler across multiple threads. From 4f8ba1771679b203da74f7399c9d84630462310c Mon Sep 17 00:00:00 2001 From: asherrick Date: Wed, 13 Nov 2024 15:50:35 -0700 Subject: [PATCH 06/14] Maybe this will work to build the docs correctly? --- docs/source/en/_toctree.yml | 2 + .../en/using-diffusers/create_a_server.md | 61 +++++++++++++++++++ 2 files changed, 63 insertions(+) create mode 100644 docs/source/en/using-diffusers/create_a_server.md diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index de6cd2981b96..24920cb71134 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -25,6 +25,8 @@ title: Working with big models title: Tutorials - sections: + - local: user-diffusers/create_a_server + title: Create a server - local: using-diffusers/loading title: Load pipelines - local: using-diffusers/custom_pipeline_overview diff --git a/docs/source/en/using-diffusers/create_a_server.md b/docs/source/en/using-diffusers/create_a_server.md new file mode 100644 index 000000000000..ffa3a12898c2 --- /dev/null +++ b/docs/source/en/using-diffusers/create_a_server.md @@ -0,0 +1,61 @@ +# Create a server + +Diffusers' pipelines can be used as an inference engine for a server. It supports concurrent and multithreaded requests to generate images that may be requested by multiple users at the same time. 
+ +This guide will show you how to use the [`StableDiffusion3Pipeline`] in a server, but feel free to use any pipeline you want. + + +Start by navigating to the `examples/server` folder and installing all of the dependencies. + +``py +pip install . +pip install -f requirements.txt +``` + +Launch the server with the following command. + +```py +python server.py +``` + +The server is accessed at http://localhost:8000. You can curl this model with the following command. +``` +curl -X POST -H "Content-Type: application/json" --data '{"model": "something", "prompt": "a kitten in front of a fireplace"}' http://localhost:8000/v1/images/generations +``` + +If you need to upgrade some dependencies, you can use either [pip-tools](https://github.com/jazzband/pip-tools) or [uv](https://github.com/astral-sh/uv). For example, upgrade the dependencies with `uv` using the following command. + +``` +uv pip compile requirements.in -o requirements.txt +``` + +## How does this Server Work? + +The server is built with [FastAPI](https://fastapi.tiangolo.com/async/). 
The endpoint for `v1/images/generations` is defined like this: +```py +@app.post("/v1/images/generations") +async def generate_image(image_input: TextToImageInput): + try: + loop = asyncio.get_event_loop() + scheduler = shared_pipeline.pipeline.scheduler.from_config(shared_pipeline.pipeline.scheduler.config) + pipeline = StableDiffusion3Pipeline.from_pipe(shared_pipeline.pipeline, scheduler=scheduler) + generator =torch.Generator(device="cuda") + generator.manual_seed(random.randint(0, 10000000)) + output = await loop.run_in_executor(None, lambda: pipeline(image_input.prompt, generator = generator)) + logger.info(f"output: {output}") + image_url = save_image(output.images[0]) + return {"data": [{"url": image_url}]} + except Exception as e: + if isinstance(e, HTTPException): + raise e + elif hasattr(e, 'message'): + raise HTTPException(status_code=500, detail=e.message + traceback.format_exc()) + raise HTTPException(status_code=500, detail=str(e) + traceback.format_exc()) +``` +Above, the `generate_image` function is defined as asynchronous with the `async` keyword so that [FastAPI](https://fastapi.tiangolo.com/async/) knows that whatever is happening in this function is not going to necessarily return a result right away. Once it hits some point in the function that it needs to await some other [Task](https://docs.python.org/3/library/asyncio-task.html#asyncio.Task), the main thread goes back to answering other HTTP requests. For us, this happens when it hits this part of the function: +```py +output = await loop.run_in_executor(None, lambda: pipeline(image_input.prompt, generator = generator)) +``` +At this point, we are tossing the execution of the pipeline function [onto a new thread](https://docs.python.org/3/library/asyncio-eventloop.html#asyncio.loop.run_in_executor) and the main thread knows to go and do some other things until a result is returned from the `pipeline`. 
+ +Another important aspect of this implementation is the portion which creates a Pipeline from the `shared_pipeline`. The goal behind this is to avoid loading the underlying model more than once into the GPU while still allowing for each new request that is running on its own thread to have its own generator and scheduler. The scheduler in particular, at the time of this writing (November 2024), is not thread safe, and it will cause errors like: `IndexError: index 21 is out of bounds for dimension 0 with size 21` if you do try to use the same scheduler across multiple threads. From 82ab9977be4f61de1c63bd198865bebc9257cdaa Mon Sep 17 00:00:00 2001 From: asherrick Date: Wed, 13 Nov 2024 15:57:13 -0700 Subject: [PATCH 07/14] Fix style issues. --- examples/server/server.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/examples/server/server.py b/examples/server/server.py index 7e1968de58ac..fee3be1b1c36 100644 --- a/examples/server/server.py +++ b/examples/server/server.py @@ -1,10 +1,21 @@ +import asyncio +import logging +import os +import random +import tempfile +import traceback +import uuid + +import aiohttp +import torch from fastapi import FastAPI, HTTPException -from fastapi.staticfiles import StaticFiles from fastapi.middleware.cors import CORSMiddleware +from fastapi.staticfiles import StaticFiles from pydantic import BaseModel -import aiohttp, asyncio, logging, os, random, sys, tempfile, torch, traceback, uuid + from diffusers.pipelines.stable_diffusion_3 import StableDiffusion3Pipeline + logger = logging.getLogger(__name__) class TextToImageInput(BaseModel): @@ -26,7 +37,7 @@ async def stop(self): def __call__(self) -> aiohttp.ClientSession: assert self.session is not None return self.session - + class TextToImagePipeline: pipeline: StableDiffusion3Pipeline = None device: str = None @@ -37,7 +48,7 @@ def start(self): logger.info("Loading CUDA") self.device = "cuda" self.pipeline = 
StableDiffusion3Pipeline.from_pretrained( - model_path, + model_path, torch_dtype=torch.bfloat16, ).to(device=self.device) elif torch.backends.mps.is_available(): @@ -50,7 +61,7 @@ def start(self): ).to(device=self.device) else: raise Exception("No CUDA or MPS device available") - + app = FastAPI() service_url = os.getenv("SERVICE_URL", "http://localhost:8000") image_dir = os.path.join(tempfile.gettempdir(), "images") @@ -110,4 +121,4 @@ async def generate_image(image_input: TextToImageInput): if __name__ == "__main__": import uvicorn - uvicorn.run(app, host="0.0.0.0", port=8000) \ No newline at end of file + uvicorn.run(app, host="0.0.0.0", port=8000) From 71f363856528ba65f958a6986a65606f984b5445 Mon Sep 17 00:00:00 2001 From: asherrick Date: Wed, 13 Nov 2024 16:40:23 -0700 Subject: [PATCH 08/14] Fix toc. --- docs/source/en/_toctree.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 24920cb71134..0459c1d9a818 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -25,7 +25,7 @@ title: Working with big models title: Tutorials - sections: - - local: user-diffusers/create_a_server + - local: using-diffusers/create_a_server title: Create a server - local: using-diffusers/loading title: Load pipelines From 36948da5b89ec8e2ce89dfe3fc74fb1e33412824 Mon Sep 17 00:00:00 2001 From: asherrick Date: Wed, 13 Nov 2024 18:40:11 -0700 Subject: [PATCH 09/14] Minor reformatting. 
--- examples/server/server.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/examples/server/server.py b/examples/server/server.py index fee3be1b1c36..f8c9bd60d4bf 100644 --- a/examples/server/server.py +++ b/examples/server/server.py @@ -18,12 +18,14 @@ logger = logging.getLogger(__name__) + class TextToImageInput(BaseModel): model: str prompt: str size: str | None = None n: int | None = None + class HttpClient: session: aiohttp.ClientSession = None @@ -38,6 +40,7 @@ def __call__(self) -> aiohttp.ClientSession: assert self.session is not None return self.session + class TextToImagePipeline: pipeline: StableDiffusion3Pipeline = None device: str = None @@ -62,6 +65,7 @@ def start(self): else: raise Exception("No CUDA or MPS device available") + app = FastAPI() service_url = os.getenv("SERVICE_URL", "http://localhost:8000") image_dir = os.path.join(tempfile.gettempdir(), "images") @@ -80,44 +84,49 @@ def start(self): allow_headers=["*"], # Allows all headers ) + @app.on_event("startup") def startup(): http_client.start() shared_pipeline.start() + def save_image(image): - filename = "draw" + str(uuid.uuid4()).split('-')[0] + ".png" + filename = "draw" + str(uuid.uuid4()).split("-")[0] + ".png" image_path = os.path.join(image_dir, filename) # write image to disk at image_path logger.info(f"Saving image to {image_path}") image.save(image_path) return os.path.join(service_url, "images", filename) -@app.get('/') -@app.post('/') -@app.options('/') + +@app.get("/") +@app.post("/") +@app.options("/") async def base(): return "Welcome to Diffusers! 
Where you can use diffusion models to generate images" + @app.post("/v1/images/generations") async def generate_image(image_input: TextToImageInput): try: loop = asyncio.get_event_loop() scheduler = shared_pipeline.pipeline.scheduler.from_config(shared_pipeline.pipeline.scheduler.config) pipeline = StableDiffusion3Pipeline.from_pipe(shared_pipeline.pipeline, scheduler=scheduler) - generator =torch.Generator(device=shared_pipeline.device) + generator = torch.Generator(device=shared_pipeline.device) generator.manual_seed(random.randint(0, 10000000)) - output = await loop.run_in_executor(None, lambda: pipeline(image_input.prompt, generator = generator)) + output = await loop.run_in_executor(None, lambda: pipeline(image_input.prompt, generator=generator)) logger.info(f"output: {output}") image_url = save_image(output.images[0]) return {"data": [{"url": image_url}]} except Exception as e: if isinstance(e, HTTPException): raise e - elif hasattr(e, 'message'): + elif hasattr(e, "message"): raise HTTPException(status_code=500, detail=e.message + traceback.format_exc()) raise HTTPException(status_code=500, detail=str(e) + traceback.format_exc()) + if __name__ == "__main__": import uvicorn From c4733dba0dd4f2cec28946d20d39cdc2e2016d85 Mon Sep 17 00:00:00 2001 From: asherrick Date: Sat, 16 Nov 2024 07:30:16 -0600 Subject: [PATCH 10/14] Move docs to proper loc. 
--- docs/source/en/_toctree.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 0459c1d9a818..2faabfec30ce 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -25,8 +25,6 @@ title: Working with big models title: Tutorials - sections: - - local: using-diffusers/create_a_server - title: Create a server - local: using-diffusers/loading title: Load pipelines - local: using-diffusers/custom_pipeline_overview @@ -57,6 +55,8 @@ - sections: - local: using-diffusers/overview_techniques title: Overview + - local: using-diffusers/create_a_server + title: Create a server - local: training/distributed_inference title: Distributed inference - local: using-diffusers/merge_loras From 468bec943317d1b7c94b47460c04400994a57c72 Mon Sep 17 00:00:00 2001 From: asherrick Date: Sat, 16 Nov 2024 07:32:03 -0600 Subject: [PATCH 11/14] Fix missing tick. --- docs/source/en/using-diffusers/create_a_server.md | 3 ++- examples/server/README.md | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/source/en/using-diffusers/create_a_server.md b/docs/source/en/using-diffusers/create_a_server.md index ffa3a12898c2..fcba35f4a758 100644 --- a/docs/source/en/using-diffusers/create_a_server.md +++ b/docs/source/en/using-diffusers/create_a_server.md @@ -1,3 +1,4 @@ + # Create a server Diffusers' pipelines can be used as an inference engine for a server. It supports concurrent and multithreaded requests to generate images that may be requested by multiple users at the same time. @@ -7,7 +8,7 @@ This guide will show you how to use the [`StableDiffusion3Pipeline`] in a server Start by navigating to the `examples/server` folder and installing all of the dependencies. -``py +```py pip install . 
pip install -r requirements.txt
```py @app.post("/v1/images/generations") async def generate_image(image_input: TextToImageInput): @@ -53,10 +52,10 @@ async def generate_image(image_input: TextToImageInput): raise HTTPException(status_code=500, detail=e.message + traceback.format_exc()) raise HTTPException(status_code=500, detail=str(e) + traceback.format_exc()) ``` -Above, the `generate_image` function is defined as asynchronous with the `async` keyword so that [FastAPI](https://fastapi.tiangolo.com/async/) knows that whatever is happening in this function is not going to necessarily return a result right away. Once it hits some point in the function that it needs to await some other [Task](https://docs.python.org/3/library/asyncio-task.html#asyncio.Task), the main thread goes back to answering other HTTP requests. For us, this happens when it hits this part of the function: +The `generate_image` function is defined as asynchronous with the [async](https://fastapi.tiangolo.com/async/) keyword so that FastAPI knows that whatever is happening in this function won't necessarily return a result right away. Once it hits some point in the function that it needs to await some other [Task](https://docs.python.org/3/library/asyncio-task.html#asyncio.Task), the main thread goes back to answering other HTTP requests. This is shown in the code below with the [await](https://fastapi.tiangolo.com/async/#async-and-await) keyword. ```py output = await loop.run_in_executor(None, lambda: pipeline(image_input.prompt, generator = generator)) ``` -At this point, we are tossing the execution of the pipeline function [onto a new thread](https://docs.python.org/3/library/asyncio-eventloop.html#asyncio.loop.run_in_executor) and the main thread knows to go and do some other things until a result is returned from the `pipeline`. 
+At this point, the execution of the pipeline function is placed onto a [new thread](https://docs.python.org/3/library/asyncio-eventloop.html#asyncio.loop.run_in_executor), and the main thread performs other things until a result is returned from the `pipeline`. -Another important aspect of this implementation is the portion which creates a Pipeline from the `shared_pipeline`. The goal behind this is to avoid loading the underlying model more than once into the GPU while still allowing for each new request that is running on its own thread to have its own generator and scheduler. The scheduler in particular, at the time of this writing (November 2024), is not thread safe, and it will cause errors like: `IndexError: index 21 is out of bounds for dimension 0 with size 21` if you do try to use the same scheduler across multiple threads. +Another important aspect of this implementation is creating a `pipeline` from `shared_pipeline`. The goal behind this is to avoid loading the underlying model more than once onto the GPU while still allowing for each new request that is running on a separate thread to have its own generator and scheduler. The scheduler, in particular, is not thread-safe, and it will cause errors like: `IndexError: index 21 is out of bounds for dimension 0 with size 21` if you try to use the same scheduler across multiple threads. From 0c1110135046a542d95d1bbaca02107079e93846 Mon Sep 17 00:00:00 2001 From: asherrick Date: Sat, 16 Nov 2024 07:37:55 -0600 Subject: [PATCH 13/14] Sync docs changes back to README. --- examples/server/README.md | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index fcba35f4a758..5be3b9cbf270 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -30,9 +30,8 @@ If you need to upgrade some dependencies, you can use either [pip-tools](https:/ uv pip compile requirements.in -o requirements.txt ``` -## How does this Server Work? 
-The server is built with [FastAPI](https://fastapi.tiangolo.com/async/). The endpoint for `v1/images/generations` is defined like this: +The server is built with [FastAPI](https://fastapi.tiangolo.com/async/). The endpoint for `v1/images/generations` is shown below. ```py @app.post("/v1/images/generations") async def generate_image(image_input: TextToImageInput): @@ -53,10 +52,10 @@ async def generate_image(image_input: TextToImageInput): raise HTTPException(status_code=500, detail=e.message + traceback.format_exc()) raise HTTPException(status_code=500, detail=str(e) + traceback.format_exc()) ``` -Above, the `generate_image` function is defined as asynchronous with the `async` keyword so that [FastAPI](https://fastapi.tiangolo.com/async/) knows that whatever is happening in this function is not going to necessarily return a result right away. Once it hits some point in the function that it needs to await some other [Task](https://docs.python.org/3/library/asyncio-task.html#asyncio.Task), the main thread goes back to answering other HTTP requests. For us, this happens when it hits this part of the function: +The `generate_image` function is defined as asynchronous with the [async](https://fastapi.tiangolo.com/async/) keyword so that FastAPI knows that whatever is happening in this function won't necessarily return a result right away. Once it hits some point in the function that it needs to await some other [Task](https://docs.python.org/3/library/asyncio-task.html#asyncio.Task), the main thread goes back to answering other HTTP requests. This is shown in the code below with the [await](https://fastapi.tiangolo.com/async/#async-and-await) keyword. 
```py output = await loop.run_in_executor(None, lambda: pipeline(image_input.prompt, generator = generator)) ``` -At this point, we are tossing the execution of the pipeline function [onto a new thread](https://docs.python.org/3/library/asyncio-eventloop.html#asyncio.loop.run_in_executor) and the main thread knows to go and do some other things until a result is returned from the `pipeline`. +At this point, the execution of the pipeline function is placed onto a [new thread](https://docs.python.org/3/library/asyncio-eventloop.html#asyncio.loop.run_in_executor), and the main thread performs other things until a result is returned from the `pipeline`. -Another important aspect of this implementation is the portion which creates a Pipeline from the `shared_pipeline`. The goal behind this is to avoid loading the underlying model more than once into the GPU while still allowing for each new request that is running on its own thread to have its own generator and scheduler. The scheduler in particular, at the time of this writing (November 2024), is not thread safe, and it will cause errors like: `IndexError: index 21 is out of bounds for dimension 0 with size 21` if you do try to use the same scheduler across multiple threads. +Another important aspect of this implementation is creating a `pipeline` from `shared_pipeline`. The goal behind this is to avoid loading the underlying model more than once onto the GPU while still allowing for each new request that is running on a separate thread to have its own generator and scheduler. The scheduler, in particular, is not thread-safe, and it will cause errors like: `IndexError: index 21 is out of bounds for dimension 0 with size 21` if you try to use the same scheduler across multiple threads. From 31711a72c3dee005ef5481399b893f3a26af6455 Mon Sep 17 00:00:00 2001 From: asherrick Date: Sat, 16 Nov 2024 07:40:39 -0600 Subject: [PATCH 14/14] Very minor update to docs to add space. 
--- docs/source/en/using-diffusers/create_a_server.md | 2 +- examples/server/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/using-diffusers/create_a_server.md b/docs/source/en/using-diffusers/create_a_server.md index 5be3b9cbf270..8ad0ed3cbe6a 100644 --- a/docs/source/en/using-diffusers/create_a_server.md +++ b/docs/source/en/using-diffusers/create_a_server.md @@ -39,7 +39,7 @@ async def generate_image(image_input: TextToImageInput): loop = asyncio.get_event_loop() scheduler = shared_pipeline.pipeline.scheduler.from_config(shared_pipeline.pipeline.scheduler.config) pipeline = StableDiffusion3Pipeline.from_pipe(shared_pipeline.pipeline, scheduler=scheduler) - generator =torch.Generator(device="cuda") + generator = torch.Generator(device="cuda") generator.manual_seed(random.randint(0, 10000000)) output = await loop.run_in_executor(None, lambda: pipeline(image_input.prompt, generator = generator)) logger.info(f"output: {output}") diff --git a/examples/server/README.md b/examples/server/README.md index 5be3b9cbf270..8ad0ed3cbe6a 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -39,7 +39,7 @@ async def generate_image(image_input: TextToImageInput): loop = asyncio.get_event_loop() scheduler = shared_pipeline.pipeline.scheduler.from_config(shared_pipeline.pipeline.scheduler.config) pipeline = StableDiffusion3Pipeline.from_pipe(shared_pipeline.pipeline, scheduler=scheduler) - generator =torch.Generator(device="cuda") + generator = torch.Generator(device="cuda") generator.manual_seed(random.randint(0, 10000000)) output = await loop.run_in_executor(None, lambda: pipeline(image_input.prompt, generator = generator)) logger.info(f"output: {output}")