From 2ce65a88b5e1688905809fb99582507baf67bd1e Mon Sep 17 00:00:00 2001 From: "yanjun.qiu" Date: Sat, 11 Oct 2025 06:01:59 +0000 Subject: [PATCH 1/7] upgrade cache-dit api --- README.md | 46 ++++++++++++++++++++++------------------- cache_config.yaml | 7 +++---- utils/pipeline_utils.py | 9 ++++---- 3 files changed, 32 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index c653509..3c1d050 100644 --- a/README.md +++ b/README.md @@ -682,34 +682,38 @@ image = pipe(prompt, num_inference_steps=4).images[0] You can use `cache-dit` to further speedup FLUX model, different configurations of compute blocks (F12B12, etc.) can be customized in cache-dit: DBCache. Please check [cache-dit](https://github.com/vipshop/cache-dit) for more details. For example: ```python -# Install: pip install -U cache-dit +# Install: pip3 install git+https://github.com/vipshop/cache-dit.git +import cache_dit from diffusers import FluxPipeline -from cache_dit.cache_factory import apply_cache_on_pipe, CacheType pipeline = FluxPipeline.from_pretrained( "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16, ).to("cuda") -# cache-dit: DBCache configs -cache_options = { - "cache_type": CacheType.DBCache, - "warmup_steps": 0, - "max_cached_steps": -1, # -1 means no limit - "Fn_compute_blocks": 1, # Fn, F1, F12, etc. - "Bn_compute_blocks": 0, # Bn, B0, B12, etc. 
- "residual_diff_threshold": 0.12, - # TaylorSeer options - "enable_taylorseer": True, - "enable_encoder_taylorseer": True, - # Taylorseer cache type cache be hidden_states or residual - "taylorseer_cache_type": "residual", - "taylorseer_kwargs": { - "n_derivatives": 2, - }, -} - -apply_cache_on_pipe(pipeline, **cache_options) +# Default options, F8B0, 8 warmup steps, and unlimited cached +# steps for good balance between performance and precision +cache_dit.enable_cache(pipeline) + +# Or using custom options via cache configs +from cache_dit import BasicCacheConfig, TaylorSeerCalibratorConfig + +cache_dit.enable_cache( + pipeline, + # Basic DBCache w/ FnBn configurations + cache_config=BasicCacheConfig( + max_warmup_steps=0, # steps do not cache + max_cached_steps=-1, # -1 means no limit + Fn_compute_blocks=1, # Fn, F1, etc. + Bn_compute_blocks=0, # Bn, B0, etc. + residual_diff_threshold=0.12, + ), + # Then, you can use the TaylorSeer Calibrator to approximate + # the values in cached steps, taylorseer_order default is 1. + calibrator_config=TaylorSeerCalibratorConfig( + taylorseer_order=1, + ), +) ``` By the way, `cache-dit` is designed to work compatibly with torch.compile. You can easily use `cache-dit` with torch.compile to further achieve a better performance. 
For example: diff --git a/cache_config.yaml b/cache_config.yaml index 844e1d9..09a784d 100644 --- a/cache_config.yaml +++ b/cache_config.yaml @@ -1,11 +1,10 @@ -cache_type: DBCache -warmup_steps: 0 +max_warmup_steps: 0 max_cached_steps: -1 +max_continuous_cached_steps: 2 Fn_compute_blocks: 1 Bn_compute_blocks: 0 residual_diff_threshold: 0.12 enable_taylorseer: true enable_encoder_taylorseer: true taylorseer_cache_type: residual -taylorseer_kwargs: - n_derivatives: 2 +taylorseer_order: 2 \ No newline at end of file diff --git a/utils/pipeline_utils.py b/utils/pipeline_utils.py index c40b111..e19d8f4 100644 --- a/utils/pipeline_utils.py +++ b/utils/pipeline_utils.py @@ -407,12 +407,11 @@ def optimize(pipeline, args): ) try: # docs: https://github.com/vipshop/cache-dit - from cache_dit.cache_factory import apply_cache_on_pipe - from cache_dit.cache_factory import load_cache_options_from_yaml - cache_options = load_cache_options_from_yaml( - args.cache_dit_config + import cache_dit + + cache_dit.enable_cache( + pipeline, **cache_dit.load_options(args.cache_dit_config), ) - apply_cache_on_pipe(pipeline, **cache_options) except ImportError as e: print( "You have passed the '--cache_dit_config' flag, but we cannot " From 579a82ca602ab0c558690033e409ca23c16b2d4e Mon Sep 17 00:00:00 2001 From: "yanjun.qiu" Date: Sat, 11 Oct 2025 06:10:14 +0000 Subject: [PATCH 2/7] upgrade cache-dit api --- run_benchmark.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/run_benchmark.py b/run_benchmark.py index a897a86..97790ae 100644 --- a/run_benchmark.py +++ b/run_benchmark.py @@ -54,6 +54,10 @@ def main(args): print('time mean/var:', timings, timings.mean().item(), timings.var().item()) image.save(args.output_file) + if args.cache_dit_config is not None: + import cache_dit + cache_dit.summary(pipeline) + # optionally generate PyTorch Profiler trace # this is done after benchmarking because tracing introduces overhead if args.trace_file is not None: From 
0a0f26cb2a8df9f280e291368395bf61eb016235 Mon Sep 17 00:00:00 2001 From: "yanjun.qiu" Date: Sat, 11 Oct 2025 06:13:54 +0000 Subject: [PATCH 3/7] upgrade cache-dit api --- run_benchmark.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/run_benchmark.py b/run_benchmark.py index 97790ae..6298e5b 100644 --- a/run_benchmark.py +++ b/run_benchmark.py @@ -55,8 +55,11 @@ def main(args): image.save(args.output_file) if args.cache_dit_config is not None: - import cache_dit - cache_dit.summary(pipeline) + try: + import cache_dit + cache_dit.summary(pipeline) + except ImportError: + print("cache-dit not installed, please install it to see cache-dit summary") # optionally generate PyTorch Profiler trace # this is done after benchmarking because tracing introduces overhead From c1e44ad4981a84c42614cc1be21b7064003ee25c Mon Sep 17 00:00:00 2001 From: "yanjun.qiu" Date: Sat, 11 Oct 2025 06:20:52 +0000 Subject: [PATCH 4/7] upgrade cache-dit api --- cache_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cache_config.yaml b/cache_config.yaml index 09a784d..2e8b939 100644 --- a/cache_config.yaml +++ b/cache_config.yaml @@ -3,7 +3,7 @@ max_cached_steps: -1 max_continuous_cached_steps: 2 Fn_compute_blocks: 1 Bn_compute_blocks: 0 -residual_diff_threshold: 0.12 +residual_diff_threshold: 0.30 enable_taylorseer: true enable_encoder_taylorseer: true taylorseer_cache_type: residual From 6a7ea73c3576ee21aa97727d37fe5a0c27518f2f Mon Sep 17 00:00:00 2001 From: "yanjun.qiu" Date: Sat, 11 Oct 2025 06:21:29 +0000 Subject: [PATCH 5/7] upgrade cache-dit api --- cache_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cache_config.yaml b/cache_config.yaml index 2e8b939..09a784d 100644 --- a/cache_config.yaml +++ b/cache_config.yaml @@ -3,7 +3,7 @@ max_cached_steps: -1 max_continuous_cached_steps: 2 Fn_compute_blocks: 1 Bn_compute_blocks: 0 -residual_diff_threshold: 0.30 +residual_diff_threshold: 0.12 enable_taylorseer: 
true enable_encoder_taylorseer: true taylorseer_cache_type: residual From de10c1475626664ad74192c776a2d103dfef62ef Mon Sep 17 00:00:00 2001 From: DefTruth Date: Sat, 11 Oct 2025 06:24:03 +0000 Subject: [PATCH 6/7] upgrade cache-dit api --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3c1d050..b7c923b 100644 --- a/README.md +++ b/README.md @@ -682,7 +682,7 @@ image = pipe(prompt, num_inference_steps=4).images[0] You can use `cache-dit` to further speedup FLUX model, different configurations of compute blocks (F12B12, etc.) can be customized in cache-dit: DBCache. Please check [cache-dit](https://github.com/vipshop/cache-dit) for more details. For example: ```python -# Install: pip3 install git+https://github.com/vipshop/cache-dit.git +# Install: pip install git+https://github.com/vipshop/cache-dit.git import cache_dit from diffusers import FluxPipeline From e1bf2df36fb4eee862325b23a8e4f7106e33dcbc Mon Sep 17 00:00:00 2001 From: DefTruth Date: Sat, 11 Oct 2025 06:26:24 +0000 Subject: [PATCH 7/7] upgrade cache-dit api --- cache_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cache_config.yaml b/cache_config.yaml index 09a784d..2e8b939 100644 --- a/cache_config.yaml +++ b/cache_config.yaml @@ -3,7 +3,7 @@ max_cached_steps: -1 max_continuous_cached_steps: 2 Fn_compute_blocks: 1 Bn_compute_blocks: 0 -residual_diff_threshold: 0.12 +residual_diff_threshold: 0.30 enable_taylorseer: true enable_encoder_taylorseer: true taylorseer_cache_type: residual