[SERVING] Add FLUX.2 LoRA tests & support sigmas (#674)

BBuf · web-flow · commit 06c568e262db · 2026-01-09T18:05:53.000+08:00
* flux2 turbo lora

* flux2 turbo lora
diff --git a/src/cache_dit/serve/api_server.py b/src/cache_dit/serve/api_server.py
@@ -39,6 +39,10 @@ class GenerateRequestAPI(BaseModel):
     height: int = Field(1024, description="Image/Video height", ge=64, le=4096)
     num_inference_steps: int = Field(50, description="Number of inference steps", ge=1, le=200)
     guidance_scale: float = Field(7.5, description="Guidance scale", ge=0.0, le=20.0)
+    sigmas: Optional[List[float]] = Field(
+        None,
+        description="Custom sigma schedule (e.g. for turbo inference). Length should typically match num_inference_steps.",
+    )
     seed: Optional[int] = Field(None, description="Random seed")
     num_images: int = Field(1, description="Number of images to generate", ge=1, le=4)
     image_urls: Optional[List[str]] = Field(
@@ -120,6 +124,7 @@ async def generate(request: GenerateRequestAPI):
                     height=request.height,
                     num_inference_steps=request.num_inference_steps,
                     guidance_scale=request.guidance_scale,
+                    sigmas=request.sigmas,
                     seed=request.seed,
                     num_images=request.num_images,
                     image_urls=request.image_urls,
diff --git a/src/cache_dit/serve/model_manager.py b/src/cache_dit/serve/model_manager.py
@@ -7,6 +7,7 @@
 import os
 import time
 import base64
+import inspect
 import tempfile
 import math
 import torch
@@ -83,6 +84,7 @@ class GenerateRequest:
     height: int = 1024
     num_inference_steps: int = 50
     guidance_scale: float = 7.5
+    sigmas: Optional[List[float]] = None
     seed: Optional[int] = None
     num_images: int = 1
     image_urls: Optional[List[str]] = None
@@ -594,6 +596,16 @@ def generate(self, request: GenerateRequest) -> GenerateResponse:
             "generator": generator,
         }
 
+        if request.sigmas is not None:
+            try:
+                sig = inspect.signature(self.pipe.__call__)
+                if "sigmas" in sig.parameters:
+                    pipe_kwargs["sigmas"] = request.sigmas
+                else:
+                    logger.warning("Pipeline does not support sigmas, ignoring request.sigmas")
+            except Exception:
+                pipe_kwargs["sigmas"] = request.sigmas
+
         # Add num_frames for video generation
         if is_video_mode:
             pipe_kwargs["num_frames"] = request.num_frames
@@ -614,8 +626,6 @@ def generate(self, request: GenerateRequest) -> GenerateResponse:
         # Some pipelines (like Flux2Pipeline) don't support negative_prompt
         if request.negative_prompt:
             try:
-                import inspect
-
                 sig = inspect.signature(self.pipe.__call__)
                 if "negative_prompt" in sig.parameters:
                     pipe_kwargs["negative_prompt"] = request.negative_prompt
diff --git a/tests/serving/test_flux2_turbo_lora_serving.py b/tests/serving/test_flux2_turbo_lora_serving.py
@@ -0,0 +1,104 @@
+"""Test FLUX.2 Turbo LoRA model serving.
+
+Server setup:
+    CUDA_VISIBLE_DEVICES=4,5,6,7 torchrun --nproc_per_node=4 \
+        -m cache_dit.serve.serve \
+        --model-path black-forest-labs/FLUX.2-dev \
+        --lora-path fal/FLUX.2-dev-Turbo \
+        --lora-name flux.2-turbo-lora.safetensors \
+        --parallel-type ulysses \
+        --parallel-text-encoder \
+        --quantize-type float8_wo \
+        --attn _flash_3 \
+        --cache \
+        --compile \
+        --ulysses-anything
+
+This test calls /generate with a custom sigma schedule (TURBO_SIGMAS) for 8-step turbo inference.
+
+Reference LoRA: https://huggingface.co/fal/FLUX.2-dev-Turbo
+Base model: https://huggingface.co/black-forest-labs/FLUX.2-dev
+"""
+
+import os
+import requests
+import base64
+from PIL import Image
+from io import BytesIO
+
+
+# Pre-shifted custom sigmas for 8-step turbo inference
+TURBO_SIGMAS = [1.0, 0.6509, 0.4374, 0.2932, 0.1893, 0.1108, 0.0495, 0.00031]
+
+
+def call_api(prompt, name="flux2_turbo", **kwargs):
+    """POST a request to /generate and return the path of the first image.
+
+    ``kwargs`` override payload defaults; ``output_format``/``output_dir`` are
+    sent only when given. Base64 responses are saved to ``{name}.png``.
+    """
+    host = os.environ.get("CACHE_DIT_HOST", "localhost")
+    port = int(os.environ.get("CACHE_DIT_PORT", 8000))
+    payload = {
+        "prompt": prompt,
+        "width": kwargs.get("width", 1024),
+        "height": kwargs.get("height", 1024),
+        "num_inference_steps": kwargs.get("num_inference_steps", 8),
+        "guidance_scale": kwargs.get("guidance_scale", 2.5),
+        "sigmas": kwargs.get("sigmas", TURBO_SIGMAS),
+        "seed": kwargs.get("seed", 42),
+        "num_images": kwargs.get("num_images", 1),
+    }
+    for optional_key in ("output_format", "output_dir"):
+        if optional_key in kwargs:
+            payload[optional_key] = kwargs[optional_key]
+
+    response = requests.post(f"http://{host}:{port}/generate", json=payload, timeout=600)
+    response.raise_for_status()
+    result = response.json()
+    assert "images" in result and result["images"], "No images in response"
+
+    if payload.get("output_format", "base64") == "path":
+        filename = result["images"][0]
+        assert os.path.exists(filename)
+        # Image.open keeps the file handle open; the context manager closes it.
+        with Image.open(filename) as img:
+            size = img.size
+    else:
+        filename = f"{name}.png"
+        with Image.open(BytesIO(base64.b64decode(result["images"][0]))) as img:
+            img.save(filename)
+            size = img.size
+
+    print(f"Saved: {filename} ({size[0]}x{size[1]})")
+    return filename
+
+
+def test_flux2_turbo_lora():
+    """Request one 8-step turbo image via call_api and return its result."""
+    turbo_settings = {
+        "num_inference_steps": 8,
+        "guidance_scale": 2.5,
+        "sigmas": TURBO_SIGMAS,
+        "width": 1024,
+        "height": 1024,
+        "seed": 42,
+    }
+    prompt_parts = (
+        "Industrial product shot of a chrome turbocharger with glowing hot exhaust manifold, ",
+        "engraved text 'FLUX.2 [dev] Turbo by fal' on the compressor housing and 'fal' on the turbine wheel, ",
+        "gradient heat glow from orange to electric blue , studio lighting with dramatic shadows, ",
+        "shallow depth of field, engineering blueprint pattern in background.",
+    )
+    prompt = "".join(prompt_parts)
+    return call_api(prompt=prompt, name="flux2_turbo_lora", **turbo_settings)
+
+
+if __name__ == "__main__":
+    # Script entry point: run the single test directly, framed by banners.
+    banner = "=" * 80
+    print(banner)
+    print("Testing FLUX.2 Turbo LoRA Model Serving")
+    print(banner)
+    test_flux2_turbo_lora()
+    print(f"{banner}\nDone\n{banner}")