Skip to content

Commit 7a5562b

Browse files
committed
update diffusers, resolve 8bit quant memory leak #14
Pipeline components with 8-bit bitsandbytes quant layers cannot free VRAM without being garbage collected, i.e. they cannot be moved onto the CPU. Avoid caching them entirely, except for the last called pipeline, which is kept for sequential calls and destroyed when a different pipeline is called. DiffusionPipelineWrapper.__LAST_CALLED cannot hang onto the wrapper reference without interfering with memory management (modules get pinned on the GPU for 8-bit quant because they cannot be moved with .to()). Instead, create an API to recall the last used main or secondary pipeline using a factory, which may return a direct reference to it from dgenerate's object cache, or re-create it using its initial arguments. This avoids permanently pinning the self._pipeline reference used for lazy init inside DiffusionPipelineWrapper and leaking VRAM, in particular with 8-bit quant modules.
1 parent 4c3516f commit 7a5562b

File tree

7 files changed

+200
-131
lines changed

7 files changed

+200
-131
lines changed

dgenerate/batchprocess/configrunner.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -317,12 +317,20 @@ def _clear_object_cache(args: collections.abc.Sequence[str]):
317317
@staticmethod
318318
def _list_object_caches(args: collections.abc.Sequence[str]):
319319
"""
320-
List object cache names that may be cleared with \\clear_object_cache.
320+
List object cache names (and memory footprint if applicable) that may be cleared with \\clear_object_cache.
321321
"""
322322

323-
_messages.log('Object cache names:\n')
323+
_messages.log('Object caches:\n')
324+
324325
for object_cache in _memoize.get_object_cache_names():
325-
_messages.log(' ' * 4 + '"' + object_cache + '"')
326+
bin = _memoize.get_object_cache(object_cache)
327+
if isinstance(bin, _memory.SizedConstrainedObjectCache):
328+
_messages.log(
329+
' ' * 4 + '"' + object_cache +
330+
f'": {len(bin)} objects, cpu side RAM - {_memory.bytes_best_human_unit(bin.size)}'
331+
)
332+
else:
333+
_messages.log(' ' * 4 + '"' + object_cache + f'": {len(bin)} objects')
326334

327335
return 0
328336

dgenerate/imageprocessors/adetailer.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -341,10 +341,10 @@ def _adetailer(self, image):
341341
if self._pipe:
342342
last_pipe = self._pipe
343343
else:
344-
last_pipe = _pipelinewrapper.DiffusionPipelineWrapper.last_called_wrapper()
344+
last_pipe = _pipelinewrapper.DiffusionPipelineWrapper.recall_last_used_main_pipeline()
345345
if last_pipe is not None:
346346
# we only want the primary pipe, not the sdxl refiner for instance
347-
last_pipe = last_pipe.recall_main_pipeline().pipeline
347+
last_pipe = last_pipe.pipeline
348348

349349
if last_pipe is None:
350350
raise self.argument_error(

dgenerate/memoize.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,9 @@ def values(self):
118118
"""
119119
return list(self.__cache.values())
120120

121+
def __len__(self):
122+
return len(self.__cache)
123+
121124
def clear(self, collect=True):
122125
"""
123126
Clear the cache and trigger callbacks.

dgenerate/pipelinewrapper/pipelines.py

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1911,6 +1911,18 @@ def _enforce_torch_pipeline_cache_size(new_pipeline_size):
19111911
new_object_size=new_pipeline_size)
19121912

19131913

1914+
def _check_for_8bit_bnb_quant_uris(uris: list):
1915+
for uri in uris:
1916+
if uri is None:
1917+
continue
1918+
uri_obj = _uris.get_quantizer_uri_class(uri).parse(uri)
1919+
if isinstance(uri_obj, _uris.BNBQuantizerUri):
1920+
if uri_obj.bits == 8:
1921+
return True
1922+
1923+
return False
1924+
1925+
19141926
@_memoize(_torch_pipeline_cache,
19151927
exceptions={'local_files_only'},
19161928
hasher=_torch_args_hasher,
@@ -2623,6 +2635,17 @@ def _handle_generic_pipeline_load_failure(e):
26232635

26242636
_messages.debug_log(f'Finished Creating Torch Pipeline: "{pipeline_class.__name__}"')
26252637

2638+
# modules quantized in 8 bit by bitsandbytes cannot be moved off the GPU,
2639+
# which results in VRAM memory leaks in dgenerates caching system, just
2640+
# do not cache these pipelines for anything more than repeated calls, the
2641+
# only way they get removed from VRAM is if their reference count is zero
2642+
bnb_8bit_components = _check_for_8bit_bnb_quant_uris(
2643+
[quantizer_uri, unet_uri, transformer_uri] +
2644+
[u.quantizer for u in uri_quant_check])
2645+
2646+
if bnb_8bit_components:
2647+
_messages.debug_log(f'Pipeline has 8bit bnb components, not entering cache: "{pipeline_class.__name__}"')
2648+
26262649
# noinspection PyTypeChecker
26272650
return TorchPipelineCreationResult(
26282651
model_path=model_path,
@@ -2636,7 +2659,7 @@ def _handle_generic_pipeline_load_failure(e):
26362659
parsed_textual_inversion_uris=parsed_textual_inversion_uris,
26372660
parsed_controlnet_uris=parsed_controlnet_uris,
26382661
parsed_t2i_adapter_uris=parsed_t2i_adapter_uris
2639-
), _d_memoize.CachedObjectMetadata(size=estimated_memory_usage)
2662+
), _d_memoize.CachedObjectMetadata(size=estimated_memory_usage, skip=bnb_8bit_components)
26402663

26412664

26422665
__all__ = _types.module_all()

dgenerate/pipelinewrapper/wrapper.py

Lines changed: 40 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -190,17 +190,36 @@ class DiffusionPipelineWrapper:
190190
Monolithic diffusion pipelines wrapper.
191191
"""
192192

193-
__LAST_CALLED = None
193+
__LAST_RECALL_PIPELINE: _pipelines.TorchPipelineFactory = None
194+
__LAST_RECALL_SECONDARY_PIPELINE: _pipelines.TorchPipelineFactory = None
194195

195196
@staticmethod
196-
def last_called_wrapper() -> typing.Optional['DiffusionPipelineWrapper']:
197+
def recall_last_used_main_pipeline() -> typing.Optional[_pipelines.TorchPipelineCreationResult]:
197198
"""
198-
Return a reference to the last :py:class:`DiffusionPipelineWrapper`
199-
that successfully executed an image generation.
199+
Return a reference to the last :py:class:`dgenerate.pipelinewrapper.pipelines.TorchPipelineCreationResult`
200+
for the pipeline that successfully executed an image generation.
200201
201-
:return: :py:class:`DiffusionPipelineWrapper`
202+
This may recreate the pipeline if it is not cached.
203+
204+
If no image generation has occurred, this will return ``None``.
205+
206+
:return: :py:class:`dgenerate.pipelinewrapper.pipelines.TorchPipelineCreationResult` or ``None``
207+
"""
208+
return DiffusionPipelineWrapper.__LAST_RECALL_PIPELINE()
209+
210+
@staticmethod
211+
def recall_last_used_secondary_pipeline() -> typing.Optional[_pipelines.TorchPipelineCreationResult]:
212+
"""
213+
Return a reference to the last :py:class:`dgenerate.pipelinewrapper.pipelines.TorchPipelineCreationResult`
214+
for the secondary pipeline (refiner / stable cascade decoder) that successfully executed an image generation.
215+
216+
This may recreate the pipeline if it is not cached.
217+
218+
If no image generation has occurred or no secondary pipeline has been called, this will return ``None``.
219+
220+
:return: :py:class:`dgenerate.pipelinewrapper.pipelines.TorchPipelineCreationResult` or ``None``
202221
"""
203-
return DiffusionPipelineWrapper.__LAST_CALLED
222+
return DiffusionPipelineWrapper.__LAST_RECALL_SECONDARY_PIPELINE()
204223

205224
def __str__(self):
206225
return f'{self.__class__.__name__}({str(_types.get_public_attributes(self))})'
@@ -524,7 +543,7 @@ def _init(
524543
self._pipeline_type = None
525544
self._local_files_only = local_files_only
526545
self._recall_main_pipeline = None
527-
self._recall_refiner_pipeline = None
546+
self._recall_secondary_pipeline = None
528547
self._model_extra_modules = model_extra_modules
529548
self._second_model_extra_modules = second_model_extra_modules
530549
self._model_cpu_offload = model_cpu_offload
@@ -2642,23 +2661,23 @@ def recall_main_pipeline(self) -> _pipelines.PipelineCreationResult:
26422661

26432662
return self._recall_main_pipeline()
26442663

2645-
def recall_refiner_pipeline(self) -> _pipelines.PipelineCreationResult:
2664+
def recall_secondary_pipeline(self) -> _pipelines.PipelineCreationResult:
26462665
"""
2647-
Fetch the last used refiner pipeline creation result, possibly the
2648-
pipeline will be recreated if no longer in the in memory cache.
2649-
If there is no refiner pipeline currently created, which will be the
2650-
case if an image was never generated yet or a refiner model was not
2666+
Fetch the last used refiner / stable cascade decoder pipeline creation result,
2667+
possibly the pipeline will be recreated if no longer in the in memory cache.
2668+
If there is no refiner / decoder pipeline currently created, which will be the
2669+
case if an image was never generated yet or a refiner / decoder model was not
26512670
specified, :py:exc:`RuntimeError` will be raised.
26522671
26532672
:raises RuntimeError:
26542673
26552674
:return: :py:class:`dgenerate.pipelinewrapper.PipelineCreationResult`
26562675
"""
26572676

2658-
if self._recall_refiner_pipeline is None:
2677+
if self._recall_secondary_pipeline is None:
26592678
raise RuntimeError('Cannot recall refiner pipeline as one has not been created.')
26602679

2661-
return self._recall_refiner_pipeline()
2680+
return self._recall_secondary_pipeline()
26622681

26632682
def _lazy_init_pipeline(self, args: DiffusionArguments):
26642683

@@ -2703,7 +2722,7 @@ def _lazy_init_pipeline(self, args: DiffusionArguments):
27032722
self._pipeline_type = pipeline_type
27042723

27052724
self._recall_main_pipeline = None
2706-
self._recall_refiner_pipeline = None
2725+
self._recall_secondary_pipeline = None
27072726

27082727
if self._parsed_adetailer_detector_uris:
27092728
pipeline_type = _enums.PipelineType.INPAINT
@@ -2739,7 +2758,7 @@ def _lazy_init_pipeline(self, args: DiffusionArguments):
27392758
creation_result = self._recall_main_pipeline()
27402759
self._pipeline = creation_result.pipeline
27412760

2742-
self._recall_s_cascade_decoder_pipeline = _pipelines.TorchPipelineFactory(
2761+
self._recall_secondary_pipeline = _pipelines.TorchPipelineFactory(
27432762
model_path=self._parsed_s_cascade_decoder_uri.model,
27442763
model_type=_enums.ModelType.TORCH_S_CASCADE_DECODER,
27452764
pipeline_type=_enums.PipelineType.TXT2IMG,
@@ -2764,7 +2783,7 @@ def _lazy_init_pipeline(self, args: DiffusionArguments):
27642783
model_cpu_offload=self._second_model_cpu_offload,
27652784
sequential_cpu_offload=self._second_model_sequential_offload)
27662785

2767-
creation_result = self._recall_s_cascade_decoder_pipeline()
2786+
creation_result = self._recall_secondary_pipeline()
27682787
self._s_cascade_decoder_pipeline = creation_result.pipeline
27692788

27702789
elif self._sdxl_refiner_uri is not None:
@@ -2822,7 +2841,7 @@ def _lazy_init_pipeline(self, args: DiffusionArguments):
28222841
else:
28232842
refiner_extra_modules = self._second_model_extra_modules
28242843

2825-
self._recall_refiner_pipeline = _pipelines.TorchPipelineFactory(
2844+
self._recall_secondary_pipeline = _pipelines.TorchPipelineFactory(
28262845
model_path=self._parsed_sdxl_refiner_uri.model,
28272846
model_type=_enums.ModelType.TORCH_SDXL,
28282847
pipeline_type=refiner_pipeline_type,
@@ -2848,7 +2867,7 @@ def _lazy_init_pipeline(self, args: DiffusionArguments):
28482867
model_cpu_offload=self._second_model_cpu_offload,
28492868
sequential_cpu_offload=self._second_model_sequential_offload
28502869
)
2851-
self._sdxl_refiner_pipeline = self._recall_refiner_pipeline().pipeline
2870+
self._sdxl_refiner_pipeline = self._recall_secondary_pipeline().pipeline
28522871
else:
28532872
self._recall_main_pipeline = _pipelines.TorchPipelineFactory(
28542873
model_path=self._model_path,
@@ -3189,7 +3208,8 @@ def __call__(self, args: DiffusionArguments | None = None, **kwargs) -> Pipeline
31893208
result = self._call_torch(pipeline_args=pipeline_args,
31903209
user_args=copy_args)
31913210

3192-
DiffusionPipelineWrapper.__LAST_CALLED = self
3211+
DiffusionPipelineWrapper.__LAST_RECALL_PIPELINE = self._recall_main_pipeline
3212+
DiffusionPipelineWrapper.__LAST_RECALL_SECONDARY_PIPELINE = self._recall_secondary_pipeline
31933213

31943214
return result
31953215

0 commit comments

Comments
 (0)