william-murray1204
diff --git a/‎README.md‎
Lines changed: 38 additions & 9 deletions b/‎README.md‎
Lines changed: 38 additions & 9 deletions
diff --git a/‎assets/mask.png‎
3.05 KB b/‎assets/mask.png‎
3.05 KB
diff --git a/‎stable_diffusion_cpp/__init__.py‎
Lines changed: 1 addition & 1 deletion b/‎stable_diffusion_cpp/__init__.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎stable_diffusion_cpp/stable_diffusion.py‎
Lines changed: 68 additions & 39 deletions b/‎stable_diffusion_cpp/stable_diffusion.py‎
Lines changed: 68 additions & 39 deletions
diff --git a/‎stable_diffusion_cpp/stable_diffusion_cpp.py‎
Lines changed: 5 additions & 1 deletion b/‎stable_diffusion_cpp/stable_diffusion_cpp.py‎
Lines changed: 5 additions & 1 deletion
@@ -42,12 +42,12 @@ All `stable-diffusion.cpp` cmake build options can be set via the `CMAKE_ARGS` e
 
 ```bash
 # Linux and Mac
-CMAKE_ARGS="-DSD_CUBLAS=ON" pip install stable-diffusion-cpp-python
+CMAKE_ARGS="-DSD_CUDA=ON" pip install stable-diffusion-cpp-python
 ```
 
 ```powershell
 # Windows
-$env:CMAKE_ARGS="-DSD_CUBLAS=ON"
+$env:CMAKE_ARGS="-DSD_CUDA=ON"
 pip install stable-diffusion-cpp-python
 ```
 
@@ -60,13 +60,13 @@ They can also be set via `pip install -C / --config-settings` command and saved
 
 ```bash
 pip install --upgrade pip # ensure pip is up to date
-pip install stable-diffusion-cpp-python -C cmake.args="-DSD_CUBLAS=ON"
+pip install stable-diffusion-cpp-python -C cmake.args="-DSD_CUDA=ON"
 ```
 
 ```txt
 # requirements.txt
 
-stable-diffusion-cpp-python -C cmake.args="-DSD_CUBLAS=ON"
+stable-diffusion-cpp-python -C cmake.args="-DSD_CUDA=ON"
 ```
 
 </details>
@@ -75,16 +75,16 @@ stable-diffusion-cpp-python -C cmake.args="-DSD_CUBLAS=ON"
 
 Below are some common backends, their build commands and any additional environment variables required.
 
-<!-- CUBLAS -->
+<!-- CUDA -->
 <details>
-<summary>Using CUBLAS (CUDA)</summary>
+<summary>Using CUDA (CUBLAS)</summary>
 
 This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure you have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads). You can check your installed CUDA toolkit version by running `nvcc --version`.
 
 - It is recommended you have at least 4 GB of VRAM.
 
 ```bash
-CMAKE_ARGS="-DSD_CUBLAS=ON" pip install stable-diffusion-cpp-python
+CMAKE_ARGS="-DSD_CUDA=ON" pip install stable-diffusion-cpp-python
 ```
 
 </details>
@@ -148,7 +148,7 @@ CMAKE_ARGS="-DSD_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML
 <details>
 <summary>Using Flash Attention</summary>
 
-Enabling flash attention reduces memory usage by at least 400 MB. At the moment, it is not supported when CUBLAS is enabled because the kernel implementation is missing.
+Enabling flash attention reduces memory usage by at least 400 MB. At the moment, it is not supported when CUDA (CUBLAS) is enabled because the kernel implementation is missing.
 
 ```bash
 CMAKE_ARGS="-DSD_FLASH_ATTN=ON" pip install stable-diffusion-cpp-python
@@ -166,6 +166,19 @@ CMAKE_ARGS="-DGGML_OPENBLAS=ON" pip install stable-diffusion-cpp-python
 
 </details>
 
+<!-- MUSA -->
+
+<details>
+<summary>Using MUSA</summary>
+
+This provides BLAS acceleration using the MUSA cores of your Moore Threads GPU. Make sure to have the MUSA toolkit installed.
+
+```bash
+CMAKE_ARGS="-DCMAKE_C_COMPILER=/usr/local/musa/bin/clang -DCMAKE_CXX_COMPILER=/usr/local/musa/bin/clang++ -DSD_MUSA=ON -DCMAKE_BUILD_TYPE=Release" pip install stable-diffusion-cpp-python
+```
+
+</details>
+
 ### Upgrading and Reinstalling
 
 To upgrade and rebuild `stable-diffusion-cpp-python` add `--upgrade --force-reinstall --no-cache-dir` flags to the `pip install` command to ensure the package is rebuilt from source.
@@ -299,7 +312,23 @@ stable_diffusion = StableDiffusion(model_path="../models/v1-5-pruned-emaonly.saf
 
 output = stable_diffusion.img_to_img(
       prompt="blue eyes",
-      image=INPUT_IMAGE,
+      image=INPUT_IMAGE, # Note: The input image will be automatically resized to match the width and height arguments (default: 512x512)
+      strength=0.4,
+)
+```
+
+### Inpainting
+
+```python
+from stable_diffusion_cpp import StableDiffusion
+
+# Note: Inpainting with a base model gives poor results. A model fine-tuned for inpainting is recommended.
+stable_diffusion = StableDiffusion(model_path="../models/v1-5-pruned-emaonly.safetensors")
+
+output = stable_diffusion.img_to_img(
+      prompt="blue eyes",
+      image="../input.png",
+      mask_image="../mask.png", # A grayscale image where 0 is masked and 255 is unmasked
       strength=0.4,
 )
 ```
 
@@ -4,4 +4,4 @@
 
 # isort: on
 
-__version__ = "0.2.2"
+__version__ = "0.2.3"
@@ -33,9 +33,9 @@ def __init__(
         vae_decode_only: bool = False,
         vae_tiling: bool = False,
         n_threads: int = -1,
-        wtype: Union[str, GGMLType, int, float, None] = "default",
-        rng_type: Union[str, RNGType, int, float, None] = "cuda",
-        schedule: Union[str, Schedule, int, float, None] = "default",
+        wtype: Optional[Union[str, GGMLType, int, float]] = "default",
+        rng_type: Optional[Union[str, RNGType, int, float]] = "cuda",
+        schedule: Optional[Union[str, Schedule, int, float]] = "default",
         keep_clip_on_cpu: bool = False,
         keep_control_net_cpu: bool = False,
         keep_vae_on_cpu: bool = False,
@@ -189,7 +189,7 @@ def txt_to_img(
         guidance: float = 3.5,
         width: int = 512,
         height: int = 512,
-        sample_method: Union[str, SampleMethod, int, float, None] = "euler_a",
+        sample_method: Optional[Union[str, SampleMethod, int, float]] = "euler_a",
         sample_steps: int = 20,
         seed: int = 42,
         batch_count: int = 1,
@@ -315,13 +315,14 @@ def img_to_img(
         self,
         image: Union[Image.Image, str],
         prompt: str,
+        mask_image: Optional[Union[Image.Image, str]] = None,
         negative_prompt: str = "",
         clip_skip: int = -1,
         cfg_scale: float = 7.0,
         guidance: float = 3.5,
         width: int = 512,
         height: int = 512,
-        sample_method: Union[str, SampleMethod, int, float, None] = "euler_a",
+        sample_method: Optional[Union[str, SampleMethod, int, float]] = "euler_a",
         sample_steps: int = 20,
         strength: float = 0.75,
         seed: int = 42,
@@ -344,6 +345,7 @@ def img_to_img(
         Args:
             image: The input image path or Pillow Image to direct the generation.
             prompt: The prompt to render.
+            mask_image: The inpainting mask image path or Pillow Image.
             negative_prompt: The negative prompt.
             clip_skip: Ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer.
             cfg_scale: Unconditional guidance scale.
@@ -414,8 +416,25 @@ def sd_progress_callback(
         # Resize the input image
         image = self._resize_image(image, width, height)  # Input image and generated image must have the same size
 
-        # Convert the image to a byte array
+        def _create_blank_mask_image(width: int, height: int):
+            """Create a blank white mask image in c_uint8 format."""
+            mask_image_buffer = (ctypes.c_uint8 * (width * height))(*[255] * (width * height))
+            return mask_image_buffer
+
+        # Convert the image and mask image to a byte array
         image_pointer = self._image_to_sd_image_t_p(image)
+        if mask_image:
+            # Resize the mask image (however the mask should ideally already be the same size as the input image)
+            mask_image = self._resize_image(mask_image, width, height)
+            mask_image_pointer = self._image_to_sd_image_t_p(mask_image, channel=1)
+        else:
+            # Create a blank white mask image
+            mask_image_pointer = self._c_uint8_to_sd_image_t_p(
+                image=_create_blank_mask_image(width, height),
+                width=width,
+                height=height,
+                channel=1,
+            )
 
         # Convert skip_layers to a ctypes array
         skip_layers_array = (ctypes.c_int * len(skip_layers))(*skip_layers)
@@ -426,6 +445,7 @@ def sd_progress_callback(
             c_images = sd_cpp.img2img(
                 self.model,
                 image_pointer,
+                mask_image_pointer,
                 prompt.encode("utf-8"),
                 negative_prompt.encode("utf-8"),
                 clip_skip,
@@ -466,7 +486,7 @@ def img_to_vid(
         augmentation_level: float = 0.0,
         min_cfg: float = 1.0,
         cfg_scale: float = 7.0,
-        sample_method: Union[str, SampleMethod, int, float, None] = "euler_a",
+        sample_method: Optional[Union[str, SampleMethod, int, float]] = "euler_a",
         sample_steps: int = 20,
         strength: float = 0.75,
         seed: int = 42,
@@ -661,7 +681,6 @@ def sd_progress_callback(
         # ==================== Upscale images ====================
 
         upscaled_images = []
-
         for image in images:
 
             # Convert the image to a byte array
@@ -698,19 +717,24 @@ def _resize_image(self, image: Union[Image.Image, str], width: int, height: int)
     def _format_image(
         self,
         image: Union[Image.Image, str],
+        channel: int = 3,
     ) -> Image.Image:
-        """Convert an image path or Pillow Image to a Pillow Image of RGBA format."""
+        """Convert an image path or Pillow Image to a Pillow Image in RGB or grayscale (inpainting masks) format."""
         # Convert image path to image if str
         if isinstance(image, str):
             image = Image.open(image)
 
-        # Convert any non RGBA to RGBA
-        if image.format != "PNG":
-            image = image.convert("RGBA")
+        if channel == 1:
+            # Grayscale the image if channel is 1
+            image = image.convert("L")
+        else:
+            # Convert any non RGBA to RGBA
+            if image.format != "PNG":
+                image = image.convert("RGBA")
 
-        # Ensure the image is in RGB mode
-        if image.mode != "RGB":
-            image = image.convert("RGB")
+            # Ensure the image is in RGB mode
+            if image.mode != "RGB":
+                image = image.convert("RGB")
 
         return image, image.width, image.height
 
@@ -741,14 +765,12 @@ def _format_control_cond(
 
     # ============= Image to C uint8 pointer =============
 
-    def _cast_image(self, image: Union[Image.Image, str]):
+    def _cast_image(self, image: Union[Image.Image, str], channel: int = 3):
         """Cast a PIL Image to a C uint8 pointer."""
-
-        image, width, height = self._format_image(image)
+        image, width, height = self._format_image(image, channel)
 
         # Convert the PIL Image to a byte array
         image_bytes = image.tobytes()
-
         data = ctypes.cast(
             (ctypes.c_byte * len(image_bytes))(*image_bytes),
             ctypes.POINTER(ctypes.c_uint8),
@@ -757,8 +779,8 @@ def _cast_image(self, image: Union[Image.Image, str]):
 
     # ============= Image to C sd_image_t =============
 
-    def _c_uint8_to_sd_image_t_p(self, image: ctypes.c_uint8, width, height, channel: int = 3):
-        # Create a new C sd_image_t
+    def _c_uint8_to_sd_image_t_p(self, image: ctypes.c_uint8, width: int, height: int, channel: int = 3) -> sd_cpp.sd_image_t:
+        """Convert a C uint8 pointer to a C sd_image_t."""
         c_image = sd_cpp.sd_image_t(
             width=width,
             height=height,
@@ -767,21 +789,18 @@ def _c_uint8_to_sd_image_t_p(self, image: ctypes.c_uint8, width, height, channel
         )
         return c_image
 
-    def _image_to_sd_image_t_p(self, image: Union[Image.Image, str]):
+    def _image_to_sd_image_t_p(self, image: Union[Image.Image, str], channel: int = 3) -> sd_cpp.sd_image_t:
         """Convert a PIL Image or image path to a C sd_image_t."""
-
-        data, width, height = self._cast_image(image)
-
-        # Create a new C sd_image_t
-        c_image = self._c_uint8_to_sd_image_t_p(data, width, height)
+        data, width, height = self._cast_image(image, channel)
+        c_image = self._c_uint8_to_sd_image_t_p(data, width, height, channel)
         return c_image
 
     # ============= C sd_image_t to Image =============
 
-    def _c_array_to_bytes(self, c_array, buffer_size: int):
+    def _c_array_to_bytes(self, c_array, buffer_size: int) -> bytes:
         return bytearray(ctypes.cast(c_array, ctypes.POINTER(ctypes.c_byte * buffer_size)).contents)
 
-    def _dereference_sd_image_t_p(self, c_image: sd_cpp.sd_image_t):
+    def _dereference_sd_image_t_p(self, c_image: sd_cpp.sd_image_t) -> Dict:
         """Dereference a C sd_image_t pointer to a Python dictionary with height, width, channel and data (bytes)."""
 
         # Calculate the size of the data buffer
@@ -795,7 +814,7 @@ def _dereference_sd_image_t_p(self, c_image: sd_cpp.sd_image_t):
         }
         return image
 
-    def _image_slice(self, c_images: sd_cpp.sd_image_t, count: int, upscale_factor: int):
+    def _image_slice(self, c_images: sd_cpp.sd_image_t, count: int, upscale_factor: int) -> List[Dict]:
         """Slice a C array of images."""
         image_array = ctypes.cast(c_images, ctypes.POINTER(sd_cpp.sd_image_t * count)).contents
 
@@ -821,7 +840,7 @@ def _image_slice(self, c_images: sd_cpp.sd_image_t, count: int, upscale_factor:
         # Return the list of images
         return images
 
-    def _sd_image_t_p_to_images(self, c_images: sd_cpp.sd_image_t, count: int, upscale_factor: int):
+    def _sd_image_t_p_to_images(self, c_images: sd_cpp.sd_image_t, count: int, upscale_factor: int) -> List[Image.Image]:
         """Convert C sd_image_t_p images to a Python list of images."""
 
         # Convert C array to Python list of images
@@ -836,20 +855,30 @@ def _sd_image_t_p_to_images(self, c_images: sd_cpp.sd_image_t, count: int, upsca
 
     # ============= Bytes to Image =============
 
-    def _bytes_to_image(self, byte_data: bytes, width: int, height: int):
+    def _bytes_to_image(self, byte_data: bytes, width: int, height: int, channel: int = 3) -> Image.Image:
         """Convert a byte array to a PIL Image."""
+        # Initialize the image with RGBA mode
         image = Image.new("RGBA", (width, height))
 
         for y in range(height):
             for x in range(width):
-                idx = (y * width + x) * 3
-                image.putpixel(
-                    (x, y),
-                    (byte_data[idx], byte_data[idx + 1], byte_data[idx + 2], 255),
-                )
+                idx = (y * width + x) * channel
+                # Dynamically create the color tuple
+                color = tuple(byte_data[idx + i] if idx + i < len(byte_data) else 0 for i in range(channel))
+                if channel == 1:  # Grayscale
+                    color = (color[0],) * 3 + (255,)  # Convert to (R, G, B, A)
+                elif channel == 3:  # RGB
+                    color = color + (255,)  # Add alpha channel
+                elif channel == 4:  # RGBA
+                    pass  # Use color as is
+                else:
+                    raise ValueError(f"Unsupported channel value: {channel}")
+                # Set the pixel
+                image.putpixel((x, y), color)
+
         return image
 
-    def __setstate__(self, state):
+    def __setstate__(self, state) -> None:
         self.__init__(**state)
 
     def close(self) -> None:
@@ -865,7 +894,7 @@ def __del__(self) -> None:
 # ============================================
 
 
-def validate_dimensions(dimension: int | float, attribute_name: str) -> int:
+def validate_dimensions(dimension: Union[int, float], attribute_name: str) -> int:
     """Dimensions must be a multiple of 64 otherwise a GGML_ASSERT error is encountered."""
     dimension = int(dimension)
     if dimension <= 0 or dimension % 64 != 0:
 
@@ -253,6 +253,8 @@ class GGMLType(IntEnum):
     SD_TYPE_F16 = 1
     SD_TYPE_Q4_0 = 2
     SD_TYPE_Q4_1 = 3
+    # SD_TYPE_Q4_2 = 4 support has been removed
+    # SD_TYPE_Q4_3 = 5 support has been removed
     SD_TYPE_Q5_0 = 6
     SD_TYPE_Q5_1 = 7
     SD_TYPE_Q8_0 = 8
@@ -444,12 +446,13 @@ def txt2img(
 # ------------ img2img ------------
 
 
-# SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx, sd_image_t init_image, const char* prompt, const char* negative_prompt, int clip_skip, float cfg_scale, float guidance, int width, int height, enum sample_method_t sample_method, int sample_steps, float strength, int64_t seed, int batch_count, const sd_image_t* control_cond, float control_strength, float style_strength, bool normalize_input, const char* input_id_images_path, int* skip_layers, size_t skip_layers_count, float slg_scale, float skip_layer_start, float skip_layer_end);
+# SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx, sd_image_t init_image, sd_image_t mask_image, const char* prompt, const char* negative_prompt, int clip_skip, float cfg_scale, float guidance, int width, int height, enum sample_method_t sample_method, int sample_steps, float strength, int64_t seed, int batch_count, const sd_image_t* control_cond, float control_strength, float style_strength, bool normalize_input, const char* input_id_images_path, int* skip_layers, size_t skip_layers_count, float slg_scale, float skip_layer_start, float skip_layer_end);
 @ctypes_function(
     "img2img",
     [
         sd_ctx_t_p_ctypes,  # sd_ctx
         sd_image_t,  # init_image
+        sd_image_t, # mask_image
         ctypes.c_char_p,  # prompt
         ctypes.c_char_p,  # negative_prompt
         ctypes.c_int,  # clip_skip
@@ -478,6 +481,7 @@ def txt2img(
 def img2img(
     sd_ctx: sd_ctx_t_p,
     init_image: sd_image_t,
+    mask_image: sd_image_t,
     prompt: bytes,
     negative_prompt: bytes,
     clip_skip: int,
Original file line number	Diff line number	Diff line change
`@@ -4,4 +4,4 @@`
`4`	`4`
`5`	`5`	`# isort: on`
`6`	`6`
`7`		`-__version__ = "0.2.2"`
	`7`	`+__version__ = "0.2.3"`