feat: add generation parameters to PIL image.info, update submodule

william-murray1204 · william-murray1204 · commit cb5c0cee4e10 · 2025-08-19T20:52:26.000+10:00
diff --git a/README.md b/README.md
@@ -267,8 +267,8 @@ stable_diffusion = StableDiffusion(
 )
 output = stable_diffusion.generate_image(
       prompt="a lovely cat",
-      width=512, # Must be a multiple of 64
-      height=512, # Must be a multiple of 64
+      width=512,
+      height=512,
       progress_callback=callback,
       # seed=1337, # Uncomment to set a specific seed (use -1 for a random seed)
 )
diff --git a/stable_diffusion_cpp/_internals.py b/stable_diffusion_cpp/_internals.py
@@ -39,6 +39,8 @@ def __init__(
         keep_control_net_on_cpu: bool,
         keep_vae_on_cpu: bool,
         diffusion_flash_attn: bool,
+        diffusion_conv_direct: bool,
+        vae_conv_direct: bool,
         chroma_use_dit_mask: bool,
         chroma_use_t5_mask: bool,
         chroma_t5_mask_pad: int,
@@ -69,6 +71,8 @@ def __init__(
             keep_control_net_on_cpu=keep_control_net_on_cpu,
             keep_vae_on_cpu=keep_vae_on_cpu,
             diffusion_flash_attn=diffusion_flash_attn,
+            diffusion_conv_direct=diffusion_conv_direct,
+            vae_conv_direct=vae_conv_direct,
             chroma_use_dit_mask=chroma_use_dit_mask,
             chroma_use_t5_mask=chroma_use_t5_mask,
             chroma_t5_mask_pad=chroma_t5_mask_pad,
@@ -127,10 +131,12 @@ def __init__(
         self,
         upscaler_path: str,
         n_threads: int,
+        diffusion_conv_direct: bool,
         verbose: bool,
     ):
         self.upscaler_path = upscaler_path
         self.n_threads = n_threads
+        self.diffusion_conv_direct = diffusion_conv_direct
         self.verbose = verbose
         self._exit_stack = ExitStack()
 
@@ -146,7 +152,11 @@ def __init__(
                 raise ValueError(f"Upscaler model path does not exist: {upscaler_path}")
 
             # Load the image upscaling model ctx
-            self.upscaler = sd_cpp.new_upscaler_ctx(upscaler_path.encode("utf-8"), self.n_threads)
+            self.upscaler = sd_cpp.new_upscaler_ctx(
+                upscaler_path.encode("utf-8"),
+                self.n_threads,
+                self.diffusion_conv_direct,
+            )
 
             # Check if the model was loaded successfully
             if self.upscaler is None:
diff --git a/stable_diffusion_cpp/stable_diffusion.py b/stable_diffusion_cpp/stable_diffusion.py
@@ -40,6 +40,8 @@ def __init__(
         keep_control_net_on_cpu: bool = False,
         keep_vae_on_cpu: bool = False,
         diffusion_flash_attn: bool = False,
+        diffusion_conv_direct: bool = False,
+        vae_conv_direct: bool = False,
         chroma_use_dit_mask: bool = True,
         chroma_use_t5_mask: bool = False,
         chroma_t5_mask_pad: int = 1,
@@ -80,6 +82,8 @@ def __init__(
             keep_control_net_on_cpu: Keep controlnet in CPU (for low vram).
             keep_vae_on_cpu: Keep vae in CPU (for low vram).
             diffusion_flash_attn: Use flash attention in diffusion model (can reduce memory usage significantly). May lower quality or crash if backend not supported.
+            diffusion_conv_direct: Use Conv2d direct in the diffusion model. May crash if backend not supported.
+            vae_conv_direct: Use Conv2d direct in the vae model (should improve performance). May crash if backend not supported.
             chroma_use_dit_mask: Use DiT mask for chroma.
             chroma_use_t5_mask: Use T5 mask for chroma.
             chroma_t5_mask_pad: T5 mask padding size of chroma.
@@ -114,6 +118,8 @@ def __init__(
         self.keep_control_net_on_cpu = keep_control_net_on_cpu
         self.keep_vae_on_cpu = keep_vae_on_cpu
         self.diffusion_flash_attn = diffusion_flash_attn
+        self.diffusion_conv_direct = diffusion_conv_direct
+        self.vae_conv_direct = vae_conv_direct
         self.chroma_use_dit_mask = chroma_use_dit_mask
         self.chroma_use_t5_mask = chroma_use_t5_mask
         self.chroma_t5_mask_pad = chroma_t5_mask_pad
@@ -160,6 +166,8 @@ def __init__(
                     keep_control_net_on_cpu=self.keep_control_net_on_cpu,
                     keep_vae_on_cpu=self.keep_vae_on_cpu,
                     diffusion_flash_attn=self.diffusion_flash_attn,
+                    diffusion_conv_direct=self.diffusion_conv_direct,
+                    vae_conv_direct=self.vae_conv_direct,
                     chroma_use_dit_mask=self.chroma_use_dit_mask,
                     chroma_use_t5_mask=self.chroma_use_t5_mask,
                     chroma_t5_mask_pad=self.chroma_t5_mask_pad,
@@ -175,6 +183,7 @@ def __init__(
                 _UpscalerModel(
                     upscaler_path=upscaler_path,
                     n_threads=self.n_threads,
+                    diffusion_conv_direct=self.diffusion_conv_direct,
                     verbose=self.verbose,
                 )
             )
@@ -276,7 +285,7 @@ def generate_image(
 
         sample_method = validate_and_set_input(sample_method, SAMPLE_METHOD_MAP, "sample_method")
 
-        # Ensure dimensions are multiples of 64
+        # Ensure dimensions are positive integers
         width = validate_dimensions(width, "width")
         height = validate_dimensions(height, "height")
 
@@ -407,7 +416,42 @@ def _create_blank_mask_image(width: int, height: int):
             )
 
         # Convert the C array of images to a Python list of images
-        return self._sd_image_t_p_to_images(c_images, batch_count, upscale_factor)
+        images = self._sd_image_t_p_to_images(c_images, batch_count, upscale_factor)
+
+        # Record the generation and model context parameters in each image's info dict
+        for i, image in enumerate(images):
+            image.info.update(
+                {
+                    # Generation Parameters
+                    "prompt": prompt,
+                    "negative_prompt": negative_prompt,
+                    "seed": seed + i if batch_count > 1 else seed,  # Increment seed for each image in batch
+                    "sample_steps": sample_steps,
+                    "sample_method": sample_method,
+                    "cfg_scale": cfg_scale,
+                    "slg_scale": slg_scale,
+                    "skip_layers": skip_layers,
+                    "skip_layer_start": skip_layer_start,
+                    "skip_layer_end": skip_layer_end,
+                    "guidance": guidance,
+                    "eta": eta,
+                    "width": width,
+                    "height": height,
+                    # Model Context Parameters
+                    "model_path": self.model_path,
+                    "diffusion_model_path": self.diffusion_model_path,
+                    "vae_path": self.vae_path,
+                    "clip_l_path": self.clip_l_path,
+                    "clip_g_path": self.clip_g_path,
+                    "t5xxl_path": self.t5xxl_path,
+                    "taesd_path": self.taesd_path,
+                    "control_net_path": self.control_net_path,
+                    "rng_type": self.rng_type,
+                    "clip_skip": clip_skip,
+                }
+            )
+
+        return images
 
     # ============================================
     # Generate Video
@@ -476,7 +520,7 @@ def generate_video(
 
         # sample_method = validate_and_set_input(sample_method, SAMPLE_METHOD_MAP, "sample_method")
 
-        # # Ensure dimensions are multiples of 64
+        # # Ensure valid dimensions
         # width = validate_dimensions(width, "width")
         # height = validate_dimensions(height, "height")
 
@@ -865,10 +909,9 @@ def __del__(self) -> None:
 
 
 def validate_dimensions(dimension: Union[int, float], attribute_name: str) -> int:
-    """Dimensions must be a multiple of 64 otherwise a GGML_ASSERT error is encountered."""
     dimension = int(dimension)
-    if dimension <= 0 or dimension % 64 != 0:
-        raise ValueError(f"The '{attribute_name}' must be a multiple of 64.")
+    if dimension <= 0:
+        raise ValueError(f"The '{attribute_name}' must be greater than 0.")
     return dimension
 
 
diff --git a/stable_diffusion_cpp/stable_diffusion_cpp.py b/stable_diffusion_cpp/stable_diffusion_cpp.py
@@ -308,7 +308,7 @@ class GGMLType(IntEnum):
 # ------------ sd_ctx_params_t ------------
 
 
-# typedef struct { const char* model_path; const char* clip_l_path; const char* clip_g_path; const char* t5xxl_path; const char* diffusion_model_path; const char* vae_path; const char* taesd_path; const char* control_net_path; const char* lora_model_dir; const char* embedding_dir; const char* stacked_id_embed_dir; bool vae_decode_only; bool vae_tiling; bool free_params_immediately; int n_threads; enum sd_type_t wtype; enum rng_type_t rng_type; enum schedule_t schedule; bool keep_clip_on_cpu; bool keep_control_net_on_cpu; bool keep_vae_on_cpu; bool diffusion_flash_attn; bool chroma_use_dit_mask; bool chroma_use_t5_mask; int chroma_t5_mask_pad; } sd_ctx_params_t;
+# typedef struct { const char* model_path; const char* clip_l_path; const char* clip_g_path; const char* t5xxl_path; const char* diffusion_model_path; const char* vae_path; const char* taesd_path; const char* control_net_path; const char* lora_model_dir; const char* embedding_dir; const char* stacked_id_embed_dir; bool vae_decode_only; bool vae_tiling; bool free_params_immediately; int n_threads; enum sd_type_t wtype; enum rng_type_t rng_type; enum schedule_t schedule; bool keep_clip_on_cpu; bool keep_control_net_on_cpu; bool keep_vae_on_cpu; bool diffusion_flash_attn; bool diffusion_conv_direct; bool vae_conv_direct; bool chroma_use_dit_mask; bool chroma_use_t5_mask; int chroma_t5_mask_pad; } sd_ctx_params_t;
 class sd_ctx_params_t(ctypes.Structure):
     _fields_ = [
         ("model_path", ctypes.c_char_p),
@@ -333,6 +333,8 @@ class sd_ctx_params_t(ctypes.Structure):
         ("keep_control_net_on_cpu", ctypes.c_bool),
         ("keep_vae_on_cpu", ctypes.c_bool),
         ("diffusion_flash_attn", ctypes.c_bool),
+        ("diffusion_conv_direct", ctypes.c_bool),
+        ("vae_conv_direct", ctypes.c_bool),
         ("chroma_use_dit_mask", ctypes.c_bool),
         ("chroma_use_t5_mask", ctypes.c_bool),
         ("chroma_t5_mask_pad", ctypes.c_int),
@@ -532,18 +534,20 @@ class upscaler_ctx_t(ctypes.Structure):
 # ------------ new_upscaler_ctx ------------
 
 
-# SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path, int n_threads);
+# SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path, int n_threads, bool direct);
 @ctypes_function(
     "new_upscaler_ctx",
     [
         ctypes.c_char_p,  # esrgan_path
         ctypes.c_int,  # n_threads
+        ctypes.c_bool,  # direct
     ],
     upscaler_ctx_t_p_ctypes,
 )
 def new_upscaler_ctx(
     esrgan_path: bytes,
     n_threads: int,
+    direct: bool,
     /,
 ) -> upscaler_ctx_t_p: ...
 
diff --git a/tests/test_txt2img.py b/tests/test_txt2img.py
@@ -9,6 +9,19 @@
 
 LORA_DIR = "C:\\stable-diffusion\\loras"
 
+PROMPTS = [
+    # {"add": "_lora", "prompt": "a lovely cat <lora:realism_lora:1>"},  # With LORA
+    # {"add": "", "prompt": "a lovely cat"},  # Without LORA
+    # {"add": "_lora", "prompt": "a cute cat glass statue <lora:glass_statue_v1:1>"},  # With LORA
+    {"add": "", "prompt": "a cute cat glass statue"},  # Without LORA
+]
+STEPS = 4
+
+OUTPUT_DIR = "tests/outputs"
+if not os.path.exists(OUTPUT_DIR):
+    os.makedirs(OUTPUT_DIR)
+
+
 stable_diffusion = StableDiffusion(
     model_path=MODEL_PATH,
     lora_model_dir=LORA_DIR,
@@ -20,22 +33,11 @@ def callback(step: int, steps: int, time: float):
 
 
 try:
-    prompts = [
-        # {"add": "_lora", "prompt": "a lovely cat <lora:realism_lora:1>"},  # With LORA
-        # {"add": "", "prompt": "a lovely cat"},  # Without LORA
-        {"add": "_lora", "prompt": "a cute cat glass statue <lora:glass_statue_v1:1>"},  # With LORA
-        {"add": "", "prompt": "a cute cat glass statue"},  # Without LORA
-    ]
-
-    OUTPUT_DIR = "tests/outputs"
-    if not os.path.exists(OUTPUT_DIR):
-        os.makedirs(OUTPUT_DIR)
-
-    for prompt in prompts:
+    for prompt in PROMPTS:
         # Generate images
         images = stable_diffusion.generate_image(
             prompt=prompt["prompt"],
-            sample_steps=4,
+            sample_steps=STEPS,
             progress_callback=callback,
         )
 
@@ -46,3 +48,29 @@ def callback(step: int, steps: int, time: float):
 except Exception as e:
     traceback.print_exc()
     print("Test - txt2img failed: ", e)
+
+# # ======== C++ CLI ========
+
+# import subprocess
+
+# stable_diffusion = None  # Clear model
+
+# SD_CPP_CLI = "C:\\Users\\Willi\\Documents\\GitHub\\stable-diffusion.cpp\\build\\bin\\sd"
+
+# for prompt in PROMPTS:
+#     cli_cmd = [
+#         SD_CPP_CLI,
+#         "--model",
+#         MODEL_PATH,
+#         "--lora-model-dir",
+#         LORA_DIR,
+#         "--prompt",
+#         prompt["prompt"],
+#         "--steps",
+#         str(STEPS),
+#         "--output",
+#         f"{OUTPUT_DIR}/txt2img{prompt['add']}_cli.png",
+#         "-v",
+#     ]
+#     print(" ".join(cli_cmd))
+#     subprocess.run(cli_cmd, check=True)
diff --git a/vendor/stable-diffusion.cpp b/vendor/stable-diffusion.cpp
@@ -1 +1 @@
-Subproject commit eed97a5e1d054f9c1e7ac01982ae480411d4157e
+Subproject commit 5900ef6605c6fbf7934239f795c13c97bc993853

Original file line number	Diff line number	Diff line change
`@@ -267,8 +267,8 @@ stable_diffusion = StableDiffusion(`
`267`	`267`	`)`
`268`	`268`	`output = stable_diffusion.generate_image(`
`269`	`269`	`prompt="a lovely cat",`
`270`		`- width=512, # Must be a multiple of 64`
`271`		`- height=512, # Must be a multiple of 64`
	`270`	`+ width=512,`
	`271`	`+ height=512,`
`272`	`272`	`progress_callback=callback,`
`273`	`273`	`# seed=1337, # Uncomment to set a specific seed (use -1 for a random seed)`
`274`	`274`	`)`