@@ -637,7 +637,6 @@ struct FinalLayer : public GGMLBlock {
 struct MMDiT : public GGMLBlock {
     // Diffusion model with a Transformer backbone.
 protected:
-    SDVersion version                = VERSION_SD3_2B;
     int64_t input_size               = -1;
     int64_t patch_size               = 2;
     int64_t in_channels              = 16;
@@ -659,8 +658,7 @@ struct MMDiT : public GGMLBlock {
     }

 public:
-    MMDiT(SDVersion version = VERSION_SD3_2B)
-        : version(version) {
+    MMDiT(std::map<std::string, enum ggml_type>& tensor_types) {
         // input_size is always None
         // learn_sigma is always False
         // register_length is always 0
@@ -672,48 +670,48 @@ struct MMDiT : public GGMLBlock {
         // pos_embed_scaling_factor is not used
         // pos_embed_offset is not used
         // context_embedder_config is always {'target': 'torch.nn.Linear', 'params': {'in_features': 4096, 'out_features': 1536}}
-        if (version == VERSION_SD3_2B) {
-            input_size               = -1;
-            patch_size               = 2;
-            in_channels              = 16;
-            depth                    = 24;
-            mlp_ratio                = 4.0f;
-            adm_in_channels          = 2048;
-            out_channels             = 16;
-            pos_embed_max_size       = 192;
-            num_patchs               = 36864;  // 192 * 192
-            context_size             = 4096;
-            context_embedder_out_dim = 1536;
-        } else if (version == VERSION_SD3_5_8B) {
-            input_size               = -1;
-            patch_size               = 2;
-            in_channels              = 16;
-            depth                    = 38;
-            mlp_ratio                = 4.0f;
-            adm_in_channels          = 2048;
-            out_channels             = 16;
-            pos_embed_max_size       = 192;
-            num_patchs               = 36864;  // 192 * 192
-            context_size             = 4096;
-            context_embedder_out_dim = 2432;
-            qk_norm                  = "rms";
-        } else if (version == VERSION_SD3_5_2B) {
-            input_size               = -1;
-            patch_size               = 2;
-            in_channels              = 16;
-            depth                    = 24;
-            d_self                   = 12;
-            mlp_ratio                = 4.0f;
-            adm_in_channels          = 2048;
-            out_channels             = 16;
-            pos_embed_max_size       = 384;
-            num_patchs               = 147456;
-            context_size             = 4096;
-            context_embedder_out_dim = 1536;
-            qk_norm                  = "rms";
+
+        // derive the model configuration from the tensor names in the checkpoint
+        for (const auto& pair : tensor_types) {
+            std::string tensor_name = pair.first;
+            if (tensor_name.find("model.diffusion_model.") == std::string::npos)
+                continue;
+            size_t jb = tensor_name.find("joint_blocks.");
+            if (jb != std::string::npos) {
+                tensor_name = tensor_name.substr(jb);  // remove prefix
+                // the block index follows "joint_blocks." (13 characters)
+                int block_depth = atoi(tensor_name.substr(13, tensor_name.find(".", 13) - 13).c_str());
+                if (block_depth + 1 > depth) {
+                    depth = block_depth + 1;
+                }
+                // a bias on the QK norm indicates LayerNorm, otherwise RMSNorm
+                if (tensor_name.find("attn.ln") != std::string::npos) {
+                    if (tensor_name.find(".bias") != std::string::npos) {
+                        qk_norm = "ln";
+                    } else {
+                        qk_norm = "rms";
+                    }
+                }
+                // blocks with a second self-attention (attn2) are MMDiT-x blocks
+                if (tensor_name.find("attn2") != std::string::npos) {
+                    if (block_depth > d_self) {
+                        d_self = block_depth;
+                    }
+                }
+            }
         }
+
+        // MMDiT-x checkpoints use a doubled position-embedding grid (384 vs 192 per side)
+        if (d_self >= 0) {
+            pos_embed_max_size *= 2;
+            num_patchs *= 4;
+        }
+
+        LOG_INFO("MMDiT layers: %d (including %d MMDiT-x layers)", depth, d_self + 1);
+
         int64_t default_out_channels = in_channels;
         hidden_size                  = 64 * depth;
+        context_embedder_out_dim     = 64 * depth;
         int64_t num_heads            = depth;

         blocks["x_embedder"] = std::shared_ptr<GGMLBlock>(new PatchEmbed(input_size, patch_size, in_channels, hidden_size, true));
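Instead of dispatching on a hard-coded SDVersion enum, the constructor now derives depth, qk_norm, and the MMDiT-x block count directly from the checkpoint's tensor names. Below is a minimal standalone sketch of that scanning heuristic, under the assumption that names follow the "model.diffusion_model.joint_blocks.N.*" convention; the tensor names here are hypothetical and the map values are placeholders (the real map carries ggml tensor types):

#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <map>
#include <string>

int main() {
    // Hypothetical subset of a checkpoint's tensor names (values are placeholders).
    std::map<std::string, int> tensor_types = {
        {"model.diffusion_model.joint_blocks.0.x_block.attn.ln_q.weight", 0},
        {"model.diffusion_model.joint_blocks.11.x_block.attn2.qkv.weight", 0},
        {"model.diffusion_model.joint_blocks.23.context_block.attn.qkv.weight", 0},
    };

    int depth  = 0;       // number of joint blocks = highest block index + 1
    int d_self = -1;      // highest block index that has a second self-attention
    std::string qk_norm;  // "ln" if the QK norm carries a bias, else "rms"

    for (const auto& pair : tensor_types) {
        std::string name = pair.first;
        size_t jb = name.find("joint_blocks.");
        if (jb == std::string::npos)
            continue;
        name = name.substr(jb);
        // the block index starts right after "joint_blocks." (13 characters)
        int block_index = atoi(name.substr(13, name.find(".", 13) - 13).c_str());
        depth = std::max(depth, block_index + 1);
        if (name.find("attn.ln") != std::string::npos)
            qk_norm = (name.find(".bias") != std::string::npos) ? "ln" : "rms";
        if (name.find("attn2") != std::string::npos)
            d_self = std::max(d_self, block_index);
    }

    // prints: depth=24 d_self=11 qk_norm=rms
    printf("depth=%d d_self=%d qk_norm=%s\n", depth, d_self, qk_norm.c_str());
    return 0;
}

The payoff is that a new checkpoint can be loaded without adding an enum entry first, as long as its tensor names follow the expected layout.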
@@ -879,9 +873,8 @@ struct MMDiTRunner : public GGMLRunner {

     MMDiTRunner(ggml_backend_t backend,
                 std::map<std::string, enum ggml_type>& tensor_types = empty_tensor_types,
-                const std::string prefix = "",
-                SDVersion version        = VERSION_SD3_2B)
-        : GGMLRunner(backend), mmdit(version) {
+                const std::string prefix = "")
+        : GGMLRunner(backend), mmdit(tensor_types) {
         mmdit.init(params_ctx, tensor_types, prefix);
     }

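Since num_heads equals depth and each attention head is 64-dimensional, both hidden_size and the new context_embedder_out_dim reduce to 64 * depth. A quick arithmetic check that this rule reproduces the constants hard-coded in the deleted per-version branches:

#include <cassert>

int main() {
    assert(64 * 24 == 1536);  // depth 24 (SD3 2B / SD3.5 2B): context_embedder_out_dim = 1536
    assert(64 * 38 == 2432);  // depth 38 (SD3.5 8B): context_embedder_out_dim = 2432
    return 0;
}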