diff --git a/.clang-tidy b/.clang-tidy new file mode 100644 index 00000000..63924a0b --- /dev/null +++ b/.clang-tidy @@ -0,0 +1,10 @@ +Checks: > + modernize-make-shared, + modernize-use-nullptr, + modernize-use-override, + modernize-pass-by-value, + modernize-return-braced-init-list, + modernize-deprecated-headers, +HeaderFilterRegex: '^$' +WarningsAsErrors: '' +FormatStyle: none \ No newline at end of file diff --git a/clip.hpp b/clip.hpp index 12d9f4f6..296ca9aa 100644 --- a/clip.hpp +++ b/clip.hpp @@ -550,7 +550,7 @@ class CLIPEmbeddings : public GGMLBlock { int64_t num_positions; bool force_clip_f32; - void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") { + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") override { enum ggml_type token_wtype = GGML_TYPE_F32; if (!force_clip_f32) { token_wtype = get_type(prefix + "token_embedding.weight", tensor_types, GGML_TYPE_F32); @@ -587,7 +587,7 @@ class CLIPEmbeddings : public GGMLBlock { GGML_ASSERT(input_ids->ne[0] == position_embed_weight->ne[1]); input_ids = ggml_reshape_3d(ctx, input_ids, input_ids->ne[0], 1, input_ids->ne[1]); - auto token_embedding = ggml_get_rows(ctx, custom_embed_weight != NULL ? custom_embed_weight : token_embed_weight, input_ids); + auto token_embedding = ggml_get_rows(ctx, custom_embed_weight != nullptr ? custom_embed_weight : token_embed_weight, input_ids); token_embedding = ggml_reshape_3d(ctx, token_embedding, token_embedding->ne[0], token_embedding->ne[1], token_embedding->ne[3]); // token_embedding + position_embedding @@ -606,7 +606,7 @@ class CLIPVisionEmbeddings : public GGMLBlock { int64_t image_size; int64_t num_patches; int64_t num_positions; - void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") { + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") override { enum ggml_type patch_wtype = GGML_TYPE_F16; enum ggml_type class_wtype = GGML_TYPE_F32; enum ggml_type position_wtype = GGML_TYPE_F32; @@ -641,10 +641,10 @@ class CLIPVisionEmbeddings : public GGMLBlock { // concat(patch_embedding, class_embedding) + position_embedding struct ggml_tensor* patch_embedding; int64_t N = pixel_values->ne[3]; - patch_embedding = ggml_nn_conv_2d(ctx, pixel_values, patch_embed_weight, NULL, patch_size, patch_size); // [N, embed_dim, image_size // pacht_size, image_size // pacht_size] - patch_embedding = ggml_reshape_3d(ctx, patch_embedding, num_patches, embed_dim, N); // [N, embed_dim, num_patches] - patch_embedding = ggml_cont(ctx, ggml_permute(ctx, patch_embedding, 1, 0, 2, 3)); // [N, num_patches, embed_dim] - patch_embedding = ggml_reshape_4d(ctx, patch_embedding, 1, embed_dim, num_patches, N); // [N, num_patches, embed_dim, 1] + patch_embedding = ggml_nn_conv_2d(ctx, pixel_values, patch_embed_weight, nullptr, patch_size, patch_size); // [N, embed_dim, image_size // pacht_size, image_size // pacht_size] + patch_embedding = ggml_reshape_3d(ctx, patch_embedding, num_patches, embed_dim, N); // [N, embed_dim, num_patches] + patch_embedding = ggml_cont(ctx, ggml_permute(ctx, patch_embedding, 1, 0, 2, 3)); // [N, num_patches, embed_dim] + patch_embedding = ggml_reshape_4d(ctx, patch_embedding, 1, embed_dim, num_patches, N); // [N, num_patches, embed_dim, 1] struct ggml_tensor* class_embedding = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, embed_dim, N); class_embedding = 
ggml_repeat(ctx, class_embed_weight, class_embedding); // [N, embed_dim] @@ -669,7 +669,7 @@ enum CLIPVersion { class CLIPTextModel : public GGMLBlock { protected: - void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") { + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") override { if (version == OPEN_CLIP_VIT_BIGG_14) { enum ggml_type wtype = GGML_TYPE_F32; params["text_projection"] = ggml_new_tensor_2d(ctx, wtype, projection_dim, hidden_size); @@ -735,8 +735,8 @@ class CLIPTextModel : public GGMLBlock { if (return_pooled) { auto text_projection = params["text_projection"]; ggml_tensor* pooled = ggml_view_1d(ctx, x, hidden_size, x->nb[1] * max_token_idx); - if (text_projection != NULL) { - pooled = ggml_nn_linear(ctx, pooled, text_projection, NULL); + if (text_projection != nullptr) { + pooled = ggml_nn_linear(ctx, pooled, text_projection, nullptr); } else { LOG_DEBUG("identity projection"); } @@ -814,7 +814,7 @@ class CLIPProjection : public UnaryBlock { int64_t out_features; bool transpose_weight; - void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") { + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") override { enum ggml_type wtype = get_type(prefix + "weight", tensor_types, GGML_TYPE_F32); if (transpose_weight) { params["weight"] = ggml_new_tensor_2d(ctx, wtype, out_features, in_features); @@ -831,12 +831,12 @@ class CLIPProjection : public UnaryBlock { out_features(out_features), transpose_weight(transpose_weight) {} - struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { + struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override { struct ggml_tensor* w = params["weight"]; if (transpose_weight) { w = ggml_cont(ctx, ggml_transpose(ctx, w)); } - return ggml_nn_linear(ctx, x, w, NULL); + return ggml_nn_linear(ctx, x, w, nullptr); } }; @@ -894,7 +894,7 @@ struct CLIPTextModelRunner : public GGMLRunner { model.init(params_ctx, tensor_types, prefix); } - std::string get_desc() { + std::string get_desc() override { return "clip"; } @@ -921,7 +921,7 @@ struct CLIPTextModelRunner : public GGMLRunner { struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids, int num_custom_embeddings = 0, - void* custom_embeddings_data = NULL, + void* custom_embeddings_data = nullptr, size_t max_token_idx = 0, bool return_pooled = false, int clip_skip = -1) { @@ -929,9 +929,9 @@ struct CLIPTextModelRunner : public GGMLRunner { input_ids = to_backend(input_ids); - struct ggml_tensor* embeddings = NULL; + struct ggml_tensor* embeddings = nullptr; - if (num_custom_embeddings > 0 && custom_embeddings_data != NULL) { + if (num_custom_embeddings > 0 && custom_embeddings_data != nullptr) { auto token_embed_weight = model.get_token_embed_weight(); auto custom_embeddings = ggml_new_tensor_2d(compute_ctx, token_embed_weight->type, @@ -958,7 +958,7 @@ struct CLIPTextModelRunner : public GGMLRunner { bool return_pooled, int clip_skip, ggml_tensor** output, - ggml_context* output_ctx = NULL) { + ggml_context* output_ctx = nullptr) { auto get_graph = [&]() -> struct ggml_cgraph* { return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled, clip_skip); }; diff --git a/common.hpp b/common.hpp index d3216714..7cc95d5b 100644 --- a/common.hpp +++ b/common.hpp @@ 
-121,7 +121,7 @@ class ResBlock : public GGMLBlock { } } - virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* emb = NULL) { + virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* emb = nullptr) { // For dims==3, we reduce dimension from 5d to 4d by merging h and w, in order not to change ggml // [N, c, t, h, w] => [N, c, t, h * w] // x: [N, channels, h, w] if dims == 2 else [N, channels, t, h, w] @@ -131,7 +131,7 @@ class ResBlock : public GGMLBlock { auto out_layers_0 = std::dynamic_pointer_cast(blocks["out_layers.0"]); auto out_layers_3 = std::dynamic_pointer_cast(blocks["out_layers.3"]); - if (emb == NULL) { + if (emb == nullptr) { GGML_ASSERT(skip_t_emb); } @@ -182,7 +182,7 @@ class GEGLU : public UnaryBlock { int64_t dim_in; int64_t dim_out; - void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") { + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") override { enum ggml_type wtype = get_type(prefix + "proj.weight", tensor_types, GGML_TYPE_F32); enum ggml_type bias_wtype = GGML_TYPE_F32; params["proj.weight"] = ggml_new_tensor_2d(ctx, wtype, dim_in, dim_out * 2); @@ -193,7 +193,7 @@ class GEGLU : public UnaryBlock { GEGLU(int64_t dim_in, int64_t dim_out) : dim_in(dim_in), dim_out(dim_out) {} - struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { + struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override { // x: [ne3, ne2, ne1, dim_in] // return: [ne3, ne2, ne1, dim_out] struct ggml_tensor* w = params["proj.weight"]; @@ -222,7 +222,7 @@ class GELU : public UnaryBlock { blocks["proj"] = std::shared_ptr(new Linear(dim_in, dim_out, bias)); } - struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { + struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override { // x: [ne3, ne2, ne1, dim_in] // return: [ne3, ne2, ne1, dim_out] auto proj = std::dynamic_pointer_cast(blocks["proj"]); @@ -325,7 +325,7 @@ class CrossAttention : public GGMLBlock { auto k = to_k->forward(ctx, context); // [N, n_context, inner_dim] auto v = to_v->forward(ctx, context); // [N, n_context, inner_dim] - x = ggml_nn_attention_ext(ctx, backend, q, k, v, n_head, NULL, false, false, flash_attn); // [N, n_token, inner_dim] + x = ggml_nn_attention_ext(ctx, backend, q, k, v, n_head, nullptr, false, false, flash_attn); // [N, n_token, inner_dim] x = to_out_0->forward(ctx, x); // [N, n_token, query_dim] return x; @@ -483,7 +483,7 @@ class SpatialTransformer : public GGMLBlock { class AlphaBlender : public GGMLBlock { protected: - void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") { + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") override { // Get the type of the "mix_factor" tensor from the input tensors map with the specified prefix enum ggml_type wtype = GGML_TYPE_F32; params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1); diff --git a/conditioner.hpp b/conditioner.hpp index 4f9efb8c..e299d367 100644 --- a/conditioner.hpp +++ b/conditioner.hpp @@ -6,9 +6,9 @@ #include "t5.hpp" struct SDCondition { - struct ggml_tensor* c_crossattn = NULL; // aka context - struct ggml_tensor* c_vector = NULL; // aka y - struct ggml_tensor* c_concat = NULL; + struct ggml_tensor* c_crossattn = nullptr; 
// aka context + struct ggml_tensor* c_vector = nullptr; // aka y + struct ggml_tensor* c_concat = nullptr; SDCondition() = default; SDCondition(struct ggml_tensor* c_crossattn, struct ggml_tensor* c_vector, struct ggml_tensor* c_concat) @@ -79,28 +79,28 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { } } - void get_param_tensors(std::map& tensors) { + void get_param_tensors(std::map& tensors) override { text_model->get_param_tensors(tensors, "cond_stage_model.transformer.text_model"); if (sd_version_is_sdxl(version)) { text_model2->get_param_tensors(tensors, "cond_stage_model.1.transformer.text_model"); } } - void alloc_params_buffer() { + void alloc_params_buffer() override { text_model->alloc_params_buffer(); if (sd_version_is_sdxl(version)) { text_model2->alloc_params_buffer(); } } - void free_params_buffer() { + void free_params_buffer() override { text_model->free_params_buffer(); if (sd_version_is_sdxl(version)) { text_model2->free_params_buffer(); } } - size_t get_params_buffer_size() { + size_t get_params_buffer_size() override { size_t buffer_size = text_model->get_params_buffer_size(); if (sd_version_is_sdxl(version)) { buffer_size += text_model2->get_params_buffer_size(); @@ -121,11 +121,11 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { } struct ggml_init_params params; params.mem_size = 100 * 1024 * 1024; // max for custom embeddings 100 MB - params.mem_buffer = NULL; + params.mem_buffer = nullptr; params.no_alloc = false; struct ggml_context* embd_ctx = ggml_init(params); - struct ggml_tensor* embd = NULL; - struct ggml_tensor* embd2 = NULL; + struct ggml_tensor* embd = nullptr; + struct ggml_tensor* embd2 = nullptr; auto on_load = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) { if (tensor_storage.ne[0] != text_model->model.hidden_size) { if (text_model2) { @@ -404,11 +404,11 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { int adm_in_channels = -1, bool zero_out_masked = false) { int64_t t0 = ggml_time_ms(); - struct ggml_tensor* hidden_states = NULL; // [N, n_token, hidden_size] - struct ggml_tensor* chunk_hidden_states = NULL; // [n_token, hidden_size] or [n_token, hidden_size + hidden_size2] - struct ggml_tensor* chunk_hidden_states1 = NULL; // [n_token, hidden_size] - struct ggml_tensor* chunk_hidden_states2 = NULL; // [n_token, hidden_size2] - struct ggml_tensor* pooled = NULL; + struct ggml_tensor* hidden_states = nullptr; // [N, n_token, hidden_size] + struct ggml_tensor* chunk_hidden_states = nullptr; // [n_token, hidden_size] or [n_token, hidden_size + hidden_size2] + struct ggml_tensor* chunk_hidden_states1 = nullptr; // [n_token, hidden_size] + struct ggml_tensor* chunk_hidden_states2 = nullptr; // [n_token, hidden_size2] + struct ggml_tensor* pooled = nullptr; std::vector hidden_states_vec; if (clip_skip <= 0) { @@ -424,7 +424,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { weights.begin() + (chunk_idx + 1) * chunk_len); auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens); - struct ggml_tensor* input_ids2 = NULL; + struct ggml_tensor* input_ids2 = nullptr; size_t max_token_idx = 0; if (sd_version_is_sdxl(version)) { auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), tokenizer.EOS_TOKEN_ID); @@ -512,7 +512,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { chunk_hidden_states->ne[0], ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]); - ggml_tensor* vec = NULL; + ggml_tensor* vec = nullptr; if 
(sd_version_is_sdxl(version)) { int out_dim = 256; vec = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, adm_in_channels); @@ -549,13 +549,13 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { GGML_ASSERT(offset == ggml_nbytes(vec)); } // print_ggml_tensor(result); - return SDCondition(hidden_states, vec, NULL); + return {hidden_states, vec, nullptr}; } std::tuple> get_learned_condition_with_trigger(ggml_context* work_ctx, int n_threads, - const ConditionerParams& conditioner_params) { + const ConditionerParams& conditioner_params) override { auto image_tokens = convert_token_to_id(trigger_word); // if(image_tokens.size() == 1){ // printf(" image token id is: %d \n", image_tokens[0]); @@ -589,7 +589,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { } std::string remove_trigger_from_prompt(ggml_context* work_ctx, - const std::string& prompt) { + const std::string& prompt) override { auto image_tokens = convert_token_to_id(trigger_word); GGML_ASSERT(image_tokens.size() == 1); auto tokens_and_weights = tokenize(prompt, false); @@ -602,7 +602,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { SDCondition get_learned_condition(ggml_context* work_ctx, int n_threads, - const ConditionerParams& conditioner_params) { + const ConditionerParams& conditioner_params) override { auto tokens_and_weights = tokenize(conditioner_params.text, true); std::vector& tokens = tokens_and_weights.first; std::vector& weights = tokens_and_weights.second; @@ -628,7 +628,7 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner { vision_model.init(params_ctx, tensor_types, "cond_stage_model.transformer"); } - std::string get_desc() { + std::string get_desc() override { return "clip_vision"; } @@ -678,25 +678,25 @@ struct SD3CLIPEmbedder : public Conditioner { t5 = std::make_shared(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer"); } - void get_param_tensors(std::map& tensors) { + void get_param_tensors(std::map& tensors) override { clip_l->get_param_tensors(tensors, "text_encoders.clip_l.transformer.text_model"); clip_g->get_param_tensors(tensors, "text_encoders.clip_g.transformer.text_model"); t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer"); } - void alloc_params_buffer() { + void alloc_params_buffer() override { clip_l->alloc_params_buffer(); clip_g->alloc_params_buffer(); t5->alloc_params_buffer(); } - void free_params_buffer() { + void free_params_buffer() override { clip_l->free_params_buffer(); clip_g->free_params_buffer(); t5->free_params_buffer(); } - size_t get_params_buffer_size() { + size_t get_params_buffer_size() override { size_t buffer_size = clip_l->get_params_buffer_size(); buffer_size += clip_g->get_params_buffer_size(); buffer_size += t5->get_params_buffer_size(); @@ -747,7 +747,7 @@ struct SD3CLIPEmbedder : public Conditioner { clip_l_tokenizer.pad_tokens(clip_l_tokens, clip_l_weights, max_length, padding); clip_g_tokenizer.pad_tokens(clip_g_tokens, clip_g_weights, max_length, padding); - t5_tokenizer.pad_tokens(t5_tokens, t5_weights, NULL, max_length, padding); + t5_tokenizer.pad_tokens(t5_tokens, t5_weights, nullptr, max_length, padding); // for (int i = 0; i < clip_l_tokens.size(); i++) { // std::cout << clip_l_tokens[i] << ":" << clip_l_weights[i] << ", "; @@ -784,14 +784,14 @@ struct SD3CLIPEmbedder : public Conditioner { } int64_t t0 = ggml_time_ms(); - struct ggml_tensor* hidden_states = NULL; // [N, n_token*2, 4096] - struct ggml_tensor* chunk_hidden_states = NULL; // [n_token*2, 4096] - struct 
ggml_tensor* chunk_hidden_states_l = NULL; // [n_token, hidden_size_l] - struct ggml_tensor* chunk_hidden_states_g = NULL; // [n_token, hidden_size_g] - struct ggml_tensor* chunk_hidden_states_t5 = NULL; // [n_token, hidden_size_t5] - struct ggml_tensor* pooled = NULL; - struct ggml_tensor* pooled_l = NULL; // [768,] - struct ggml_tensor* pooled_g = NULL; // [1280,] + struct ggml_tensor* hidden_states = nullptr; // [N, n_token*2, 4096] + struct ggml_tensor* chunk_hidden_states = nullptr; // [n_token*2, 4096] + struct ggml_tensor* chunk_hidden_states_l = nullptr; // [n_token, hidden_size_l] + struct ggml_tensor* chunk_hidden_states_g = nullptr; // [n_token, hidden_size_g] + struct ggml_tensor* chunk_hidden_states_t5 = nullptr; // [n_token, hidden_size_t5] + struct ggml_tensor* pooled = nullptr; + struct ggml_tensor* pooled_l = nullptr; // [768,] + struct ggml_tensor* pooled_g = nullptr; // [1280,] std::vector hidden_states_vec; size_t chunk_len = 77; @@ -810,7 +810,7 @@ struct SD3CLIPEmbedder : public Conditioner { clip_l->compute(n_threads, input_ids, 0, - NULL, + nullptr, max_token_idx, false, clip_skip, @@ -838,7 +838,7 @@ struct SD3CLIPEmbedder : public Conditioner { clip_l->compute(n_threads, input_ids, 0, - NULL, + nullptr, max_token_idx, true, clip_skip, @@ -860,7 +860,7 @@ struct SD3CLIPEmbedder : public Conditioner { clip_g->compute(n_threads, input_ids, 0, - NULL, + nullptr, max_token_idx, false, clip_skip, @@ -889,7 +889,7 @@ struct SD3CLIPEmbedder : public Conditioner { clip_g->compute(n_threads, input_ids, 0, - NULL, + nullptr, max_token_idx, true, clip_skip, @@ -909,7 +909,7 @@ struct SD3CLIPEmbedder : public Conditioner { t5->compute(n_threads, input_ids, - NULL, + nullptr, &chunk_hidden_states_t5, work_ctx); { @@ -974,12 +974,12 @@ struct SD3CLIPEmbedder : public Conditioner { hidden_states, chunk_hidden_states->ne[0], ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]); - return SDCondition(hidden_states, pooled, NULL); + return {hidden_states, pooled, nullptr}; } SDCondition get_learned_condition(ggml_context* work_ctx, int n_threads, - const ConditionerParams& conditioner_params) { + const ConditionerParams& conditioner_params) override { auto tokens_and_weights = tokenize(conditioner_params.text, 77, true); return get_learned_condition_common(work_ctx, n_threads, @@ -1003,22 +1003,22 @@ struct FluxCLIPEmbedder : public Conditioner { t5 = std::make_shared(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer"); } - void get_param_tensors(std::map& tensors) { + void get_param_tensors(std::map& tensors) override { clip_l->get_param_tensors(tensors, "text_encoders.clip_l.transformer.text_model"); t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer"); } - void alloc_params_buffer() { + void alloc_params_buffer() override { clip_l->alloc_params_buffer(); t5->alloc_params_buffer(); } - void free_params_buffer() { + void free_params_buffer() override { clip_l->free_params_buffer(); t5->free_params_buffer(); } - size_t get_params_buffer_size() { + size_t get_params_buffer_size() override { size_t buffer_size = clip_l->get_params_buffer_size(); buffer_size += t5->get_params_buffer_size(); return buffer_size; @@ -1061,7 +1061,7 @@ struct FluxCLIPEmbedder : public Conditioner { } clip_l_tokenizer.pad_tokens(clip_l_tokens, clip_l_weights, 77, padding); - t5_tokenizer.pad_tokens(t5_tokens, t5_weights, NULL, max_length, padding); + t5_tokenizer.pad_tokens(t5_tokens, t5_weights, nullptr, max_length, padding); // for (int i = 0; i < 
clip_l_tokens.size(); i++) { // std::cout << clip_l_tokens[i] << ":" << clip_l_weights[i] << ", "; @@ -1091,9 +1091,9 @@ struct FluxCLIPEmbedder : public Conditioner { } int64_t t0 = ggml_time_ms(); - struct ggml_tensor* hidden_states = NULL; // [N, n_token, 4096] - struct ggml_tensor* chunk_hidden_states = NULL; // [n_token, 4096] - struct ggml_tensor* pooled = NULL; // [768,] + struct ggml_tensor* hidden_states = nullptr; // [N, n_token, 4096] + struct ggml_tensor* chunk_hidden_states = nullptr; // [n_token, 4096] + struct ggml_tensor* pooled = nullptr; // [768,] std::vector hidden_states_vec; size_t chunk_count = t5_tokens.size() / chunk_len; @@ -1115,7 +1115,7 @@ struct FluxCLIPEmbedder : public Conditioner { clip_l->compute(n_threads, input_ids, 0, - NULL, + nullptr, max_token_idx, true, clip_skip, @@ -1134,7 +1134,7 @@ struct FluxCLIPEmbedder : public Conditioner { t5->compute(n_threads, input_ids, - NULL, + nullptr, &chunk_hidden_states, work_ctx); { @@ -1173,12 +1173,12 @@ struct FluxCLIPEmbedder : public Conditioner { hidden_states, chunk_hidden_states->ne[0], ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]); - return SDCondition(hidden_states, pooled, NULL); + return {hidden_states, pooled, nullptr}; } SDCondition get_learned_condition(ggml_context* work_ctx, int n_threads, - const ConditionerParams& conditioner_params) { + const ConditionerParams& conditioner_params) override { auto tokens_and_weights = tokenize(conditioner_params.text, chunk_len, true); return get_learned_condition_common(work_ctx, n_threads, @@ -1206,19 +1206,19 @@ struct T5CLIPEmbedder : public Conditioner { t5 = std::make_shared(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer", is_umt5); } - void get_param_tensors(std::map& tensors) { + void get_param_tensors(std::map& tensors) override { t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer"); } - void alloc_params_buffer() { + void alloc_params_buffer() override { t5->alloc_params_buffer(); } - void free_params_buffer() { + void free_params_buffer() override { t5->free_params_buffer(); } - size_t get_params_buffer_size() { + size_t get_params_buffer_size() override { size_t buffer_size = 0; buffer_size += t5->get_params_buffer_size(); @@ -1287,9 +1287,9 @@ struct T5CLIPEmbedder : public Conditioner { auto& t5_attn_mask_vec = std::get<2>(token_and_weights); int64_t t0 = ggml_time_ms(); - struct ggml_tensor* hidden_states = NULL; // [N, n_token, 4096] - struct ggml_tensor* chunk_hidden_states = NULL; // [n_token, 4096] - struct ggml_tensor* pooled = NULL; + struct ggml_tensor* hidden_states = nullptr; // [N, n_token, 4096] + struct ggml_tensor* chunk_hidden_states = nullptr; // [n_token, 4096] + struct ggml_tensor* pooled = nullptr; struct ggml_tensor* t5_attn_mask = vector_to_ggml_tensor(work_ctx, t5_attn_mask_vec); // [n_token] std::vector hidden_states_vec; @@ -1306,7 +1306,7 @@ struct T5CLIPEmbedder : public Conditioner { t5_attn_mask_vec.begin() + (chunk_idx + 1) * chunk_len); auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens); - auto t5_attn_mask_chunk = use_mask ? vector_to_ggml_tensor(work_ctx, chunk_mask) : NULL; + auto t5_attn_mask_chunk = use_mask ? 
vector_to_ggml_tensor(work_ctx, chunk_mask) : nullptr; t5->compute(n_threads, input_ids, @@ -1358,12 +1358,12 @@ struct T5CLIPEmbedder : public Conditioner { modify_mask_to_attend_padding(t5_attn_mask, ggml_nelements(t5_attn_mask), mask_pad); - return SDCondition(hidden_states, t5_attn_mask, NULL); + return {hidden_states, t5_attn_mask, nullptr}; } SDCondition get_learned_condition(ggml_context* work_ctx, int n_threads, - const ConditionerParams& conditioner_params) { + const ConditionerParams& conditioner_params) override { auto tokens_and_weights = tokenize(conditioner_params.text, chunk_len, true); return get_learned_condition_common(work_ctx, n_threads, @@ -1389,19 +1389,19 @@ struct Qwen2_5_VLCLIPEmbedder : public Conditioner { enable_vision); } - void get_param_tensors(std::map& tensors) { + void get_param_tensors(std::map& tensors) override { qwenvl->get_param_tensors(tensors, "text_encoders.qwen2vl"); } - void alloc_params_buffer() { + void alloc_params_buffer() override { qwenvl->alloc_params_buffer(); } - void free_params_buffer() { + void free_params_buffer() override { qwenvl->free_params_buffer(); } - size_t get_params_buffer_size() { + size_t get_params_buffer_size() override { size_t buffer_size = 0; buffer_size += qwenvl->get_params_buffer_size(); return buffer_size; @@ -1454,7 +1454,7 @@ struct Qwen2_5_VLCLIPEmbedder : public Conditioner { SDCondition get_learned_condition(ggml_context* work_ctx, int n_threads, - const ConditionerParams& conditioner_params) { + const ConditionerParams& conditioner_params) override { std::string prompt; std::vector> image_embeds; size_t system_prompt_length = 0; @@ -1530,7 +1530,7 @@ struct Qwen2_5_VLCLIPEmbedder : public Conditioner { auto& weights = std::get<1>(tokens_and_weights); int64_t t0 = ggml_time_ms(); - struct ggml_tensor* hidden_states = NULL; // [N, n_token, 3584] + struct ggml_tensor* hidden_states = nullptr; // [N, n_token, 3584] auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens); @@ -1570,7 +1570,7 @@ struct Qwen2_5_VLCLIPEmbedder : public Conditioner { int64_t t1 = ggml_time_ms(); LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0); - return SDCondition(new_hidden_states, nullptr, nullptr); + return {new_hidden_states, nullptr, nullptr}; } }; diff --git a/control.hpp b/control.hpp index 79b82a22..1f231f93 100644 --- a/control.hpp +++ b/control.hpp @@ -206,18 +206,18 @@ class ControlNetBlock : public GGMLBlock { struct ggml_tensor* guided_hint, struct ggml_tensor* timesteps, struct ggml_tensor* context, - struct ggml_tensor* y = NULL) { + struct ggml_tensor* y = nullptr) { // x: [N, in_channels, h, w] or [N, in_channels/2, h, w] // timesteps: [N,] // context: [N, max_position, hidden_size] or [1, max_position, hidden_size]. 
for example, [N, 77, 768] // y: [N, adm_in_channels] or [1, adm_in_channels] - if (context != NULL) { + if (context != nullptr) { if (context->ne[2] != x->ne[3]) { context = ggml_repeat(ctx, context, ggml_new_tensor_3d(ctx, GGML_TYPE_F32, context->ne[0], context->ne[1], x->ne[3])); } } - if (y != NULL) { + if (y != nullptr) { if (y->ne[1] != x->ne[3]) { y = ggml_repeat(ctx, y, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, y->ne[0], x->ne[3])); } @@ -237,7 +237,7 @@ class ControlNetBlock : public GGMLBlock { emb = time_embed_2->forward(ctx, emb); // [N, time_embed_dim] // SDXL/SVD - if (y != NULL) { + if (y != nullptr) { auto label_embed_0 = std::dynamic_pointer_cast(blocks["label_emb.0.0"]); auto label_embed_2 = std::dynamic_pointer_cast(blocks["label_emb.0.2"]); @@ -250,7 +250,7 @@ class ControlNetBlock : public GGMLBlock { std::vector outs; - if (guided_hint == NULL) { + if (guided_hint == nullptr) { guided_hint = input_hint_block_forward(ctx, hint, emb, context); } outs.push_back(guided_hint); @@ -312,10 +312,10 @@ struct ControlNet : public GGMLRunner { SDVersion version = VERSION_SD1; ControlNetBlock control_net; - ggml_backend_buffer_t control_buffer = NULL; // keep control output tensors in backend memory - ggml_context* control_ctx = NULL; + ggml_backend_buffer_t control_buffer = nullptr; // keep control output tensors in backend memory + ggml_context* control_ctx = nullptr; std::vector controls; // (12 input block outputs, 1 middle block output) SD 1.5 - struct ggml_tensor* guided_hint = NULL; // guided_hint cache, for faster inference + struct ggml_tensor* guided_hint = nullptr; // guided_hint cache, for faster inference bool guided_hint_cached = false; ControlNet(ggml_backend_t backend, @@ -337,14 +337,14 @@ struct ControlNet : public GGMLRunner { } } - ~ControlNet() { + ~ControlNet() override { free_control_ctx(); } void alloc_control_ctx(std::vector outs) { struct ggml_init_params params; params.mem_size = static_cast(outs.size() * ggml_tensor_overhead()) + 1024 * 1024; - params.mem_buffer = NULL; + params.mem_buffer = nullptr; params.no_alloc = true; control_ctx = ggml_init(params); @@ -366,20 +366,20 @@ struct ControlNet : public GGMLRunner { } void free_control_ctx() { - if (control_buffer != NULL) { + if (control_buffer != nullptr) { ggml_backend_buffer_free(control_buffer); - control_buffer = NULL; + control_buffer = nullptr; } - if (control_ctx != NULL) { + if (control_ctx != nullptr) { ggml_free(control_ctx); - control_ctx = NULL; + control_ctx = nullptr; } - guided_hint = NULL; + guided_hint = nullptr; guided_hint_cached = false; controls.clear(); } - std::string get_desc() { + std::string get_desc() override { return "control_net"; } @@ -391,12 +391,12 @@ struct ControlNet : public GGMLRunner { struct ggml_tensor* hint, struct ggml_tensor* timesteps, struct ggml_tensor* context, - struct ggml_tensor* y = NULL) { + struct ggml_tensor* y = nullptr) { struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, CONTROL_NET_GRAPH_SIZE, false); x = to_backend(x); if (guided_hint_cached) { - hint = NULL; + hint = nullptr; } else { hint = to_backend(hint); } @@ -408,12 +408,12 @@ struct ControlNet : public GGMLRunner { runtime_backend, x, hint, - guided_hint_cached ? guided_hint : NULL, + guided_hint_cached ? 
guided_hint : nullptr, timesteps, context, y); - if (control_ctx == NULL) { + if (control_ctx == nullptr) { alloc_control_ctx(outs); } @@ -431,8 +431,8 @@ struct ControlNet : public GGMLRunner { struct ggml_tensor* timesteps, struct ggml_tensor* context, struct ggml_tensor* y, - struct ggml_tensor** output = NULL, - struct ggml_context* output_ctx = NULL) { + struct ggml_tensor** output = nullptr, + struct ggml_context* output_ctx = nullptr) { // x: [N, in_channels, h, w] // timesteps: [N, ] // context: [N, max_position, hidden_size]([N, 77, 768]) or [1, max_position, hidden_size] diff --git a/denoiser.hpp b/denoiser.hpp index 3c53301b..cb2010ca 100644 --- a/denoiser.hpp +++ b/denoiser.hpp @@ -19,7 +19,7 @@ struct SigmaSchedule { }; struct DiscreteSchedule : SigmaSchedule { - std::vector get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) { + std::vector get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override { std::vector result; int t_max = TIMESTEPS - 1; @@ -43,7 +43,7 @@ struct DiscreteSchedule : SigmaSchedule { }; struct ExponentialSchedule : SigmaSchedule { - std::vector get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) { + std::vector get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override { std::vector sigmas; // Calculate step size @@ -150,7 +150,7 @@ std::vector log_linear_interpolation(std::vector sigma_in, https://research.nvidia.com/labs/toronto-ai/AlignYourSteps/howto.html */ struct AYSSchedule : SigmaSchedule { - std::vector get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) { + std::vector get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override { const std::vector noise_levels[] = { /* SD1.5 */ {14.6146412293f, 6.4745760956f, 3.8636745985f, 2.6946151520f, @@ -204,7 +204,7 @@ struct AYSSchedule : SigmaSchedule { * GITS Scheduler: https://github.com/zju-pi/diff-sampler/tree/main/gits-main */ struct GITSSchedule : SigmaSchedule { - std::vector get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) { + std::vector get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override { if (sigma_max <= 0.0f) { return std::vector{}; } @@ -252,7 +252,7 @@ struct SGMUniformSchedule : SigmaSchedule { }; struct KarrasSchedule : SigmaSchedule { - std::vector get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) { + std::vector get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override { // These *COULD* be function arguments here, // but does anybody ever bother to touch them? 
float rho = 7.f; @@ -350,15 +350,15 @@ struct CompVisDenoiser : public Denoiser { float sigma_data = 1.0f; - float sigma_min() { + float sigma_min() override { return sigmas[0]; } - float sigma_max() { + float sigma_max() override { return sigmas[TIMESTEPS - 1]; } - float sigma_to_t(float sigma) { + float sigma_to_t(float sigma) override { float log_sigma = std::log(sigma); std::vector dists; dists.reserve(TIMESTEPS); @@ -384,7 +384,7 @@ struct CompVisDenoiser : public Denoiser { return t; } - float t_to_sigma(float t) { + float t_to_sigma(float t) override { int low_idx = static_cast(std::floor(t)); int high_idx = static_cast(std::ceil(t)); float w = t - static_cast(low_idx); @@ -392,7 +392,7 @@ struct CompVisDenoiser : public Denoiser { return std::exp(log_sigma); } - std::vector get_scalings(float sigma) { + std::vector get_scalings(float sigma) override { float c_skip = 1.0f; float c_out = -sigma; float c_in = 1.0f / std::sqrt(sigma * sigma + sigma_data * sigma_data); @@ -400,19 +400,19 @@ struct CompVisDenoiser : public Denoiser { } // this function will modify noise/latent - ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) { + ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) override { ggml_tensor_scale(noise, sigma); ggml_tensor_add(latent, noise); return latent; } - ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent) { + ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent) override { return latent; } }; struct CompVisVDenoiser : public CompVisDenoiser { - std::vector get_scalings(float sigma) { + std::vector get_scalings(float sigma) override { float c_skip = sigma_data * sigma_data / (sigma * sigma + sigma_data * sigma_data); float c_out = -sigma * sigma_data / std::sqrt(sigma * sigma + sigma_data * sigma_data); float c_in = 1.0f / std::sqrt(sigma * sigma + sigma_data * sigma_data); @@ -429,19 +429,19 @@ struct EDMVDenoiser : public CompVisVDenoiser { scheduler = std::make_shared(); } - float t_to_sigma(float t) { + float t_to_sigma(float t) override { return std::exp(t * 4 / (float)TIMESTEPS); } - float sigma_to_t(float s) { + float sigma_to_t(float s) override { return 0.25 * std::log(s); } - float sigma_min() { + float sigma_min() override { return min_sigma; } - float sigma_max() { + float sigma_max() override { return max_sigma; } }; @@ -470,24 +470,24 @@ struct DiscreteFlowDenoiser : public Denoiser { } } - float sigma_min() { + float sigma_min() override { return sigmas[0]; } - float sigma_max() { + float sigma_max() override { return sigmas[TIMESTEPS - 1]; } - float sigma_to_t(float sigma) { + float sigma_to_t(float sigma) override { return sigma * 1000.f; } - float t_to_sigma(float t) { + float t_to_sigma(float t) override { t = t + 1; return time_snr_shift(shift, t / 1000.f); } - std::vector get_scalings(float sigma) { + std::vector get_scalings(float sigma) override { float c_skip = 1.0f; float c_out = -sigma; float c_in = 1.0f; @@ -495,14 +495,14 @@ struct DiscreteFlowDenoiser : public Denoiser { } // this function will modify noise/latent - ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) { + ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) override { ggml_tensor_scale(noise, sigma); ggml_tensor_scale(latent, 1.0f - sigma); ggml_tensor_add(latent, noise); return latent; } - ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent) { + ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* 
latent) override { ggml_tensor_scale(latent, 1.0f / (1.0f - sigma)); return latent; } @@ -529,24 +529,24 @@ struct FluxFlowDenoiser : public Denoiser { } } - float sigma_min() { + float sigma_min() override { return sigmas[0]; } - float sigma_max() { + float sigma_max() override { return sigmas[TIMESTEPS - 1]; } - float sigma_to_t(float sigma) { + float sigma_to_t(float sigma) override { return sigma; } - float t_to_sigma(float t) { + float t_to_sigma(float t) override { t = t + 1; return flux_time_shift(shift, 1.0f, t / TIMESTEPS); } - std::vector get_scalings(float sigma) { + std::vector get_scalings(float sigma) override { float c_skip = 1.0f; float c_out = -sigma; float c_in = 1.0f; @@ -554,14 +554,14 @@ struct FluxFlowDenoiser : public Denoiser { } // this function will modify noise/latent - ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) { + ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) override { ggml_tensor_scale(noise, sigma); ggml_tensor_scale(latent, 1.0f - sigma); ggml_tensor_add(latent, noise); return latent; } - ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent) { + ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent) override { ggml_tensor_scale(latent, 1.0f / (1.0f - sigma)); return latent; } diff --git a/diffusion_model.hpp b/diffusion_model.hpp index 6c38b58a..94b29bf1 100644 --- a/diffusion_model.hpp +++ b/diffusion_model.hpp @@ -8,18 +8,18 @@ #include "wan.hpp" struct DiffusionParams { - struct ggml_tensor* x = NULL; - struct ggml_tensor* timesteps = NULL; - struct ggml_tensor* context = NULL; - struct ggml_tensor* c_concat = NULL; - struct ggml_tensor* y = NULL; - struct ggml_tensor* guidance = NULL; + struct ggml_tensor* x = nullptr; + struct ggml_tensor* timesteps = nullptr; + struct ggml_tensor* context = nullptr; + struct ggml_tensor* c_concat = nullptr; + struct ggml_tensor* y = nullptr; + struct ggml_tensor* guidance = nullptr; std::vector ref_latents = {}; bool increase_ref_index = false; int num_video_frames = -1; std::vector controls = {}; float control_strength = 0.f; - struct ggml_tensor* vace_context = NULL; + struct ggml_tensor* vace_context = nullptr; float vace_strength = 1.f; std::vector skip_layers = {}; }; @@ -28,8 +28,8 @@ struct DiffusionModel { virtual std::string get_desc() = 0; virtual void compute(int n_threads, DiffusionParams diffusion_params, - struct ggml_tensor** output = NULL, - struct ggml_context* output_ctx = NULL) = 0; + struct ggml_tensor** output = nullptr, + struct ggml_context* output_ctx = nullptr) = 0; virtual void alloc_params_buffer() = 0; virtual void free_params_buffer() = 0; virtual void free_compute_buffer() = 0; @@ -49,38 +49,38 @@ struct UNetModel : public DiffusionModel { : unet(backend, offload_params_to_cpu, tensor_types, "model.diffusion_model", version, flash_attn) { } - std::string get_desc() { + std::string get_desc() override { return unet.get_desc(); } - void alloc_params_buffer() { + void alloc_params_buffer() override { unet.alloc_params_buffer(); } - void free_params_buffer() { + void free_params_buffer() override { unet.free_params_buffer(); } - void free_compute_buffer() { + void free_compute_buffer() override { unet.free_compute_buffer(); } - void get_param_tensors(std::map& tensors) { + void get_param_tensors(std::map& tensors) override { unet.get_param_tensors(tensors, "model.diffusion_model"); } - size_t get_params_buffer_size() { + size_t get_params_buffer_size() override { return 
unet.get_params_buffer_size(); } - int64_t get_adm_in_channels() { + int64_t get_adm_in_channels() override { return unet.unet.adm_in_channels; } void compute(int n_threads, DiffusionParams diffusion_params, - struct ggml_tensor** output = NULL, - struct ggml_context* output_ctx = NULL) { + struct ggml_tensor** output = nullptr, + struct ggml_context* output_ctx = nullptr) override { return unet.compute(n_threads, diffusion_params.x, diffusion_params.timesteps, @@ -103,38 +103,38 @@ struct MMDiTModel : public DiffusionModel { : mmdit(backend, offload_params_to_cpu, flash_attn, tensor_types, "model.diffusion_model") { } - std::string get_desc() { + std::string get_desc() override { return mmdit.get_desc(); } - void alloc_params_buffer() { + void alloc_params_buffer() override { mmdit.alloc_params_buffer(); } - void free_params_buffer() { + void free_params_buffer() override { mmdit.free_params_buffer(); } - void free_compute_buffer() { + void free_compute_buffer() override { mmdit.free_compute_buffer(); } - void get_param_tensors(std::map& tensors) { + void get_param_tensors(std::map& tensors) override { mmdit.get_param_tensors(tensors, "model.diffusion_model"); } - size_t get_params_buffer_size() { + size_t get_params_buffer_size() override { return mmdit.get_params_buffer_size(); } - int64_t get_adm_in_channels() { + int64_t get_adm_in_channels() override { return 768 + 1280; } void compute(int n_threads, DiffusionParams diffusion_params, - struct ggml_tensor** output = NULL, - struct ggml_context* output_ctx = NULL) { + struct ggml_tensor** output = nullptr, + struct ggml_context* output_ctx = nullptr) override { return mmdit.compute(n_threads, diffusion_params.x, diffusion_params.timesteps, @@ -158,38 +158,38 @@ struct FluxModel : public DiffusionModel { : flux(backend, offload_params_to_cpu, tensor_types, "model.diffusion_model", version, flash_attn, use_mask) { } - std::string get_desc() { + std::string get_desc() override { return flux.get_desc(); } - void alloc_params_buffer() { + void alloc_params_buffer() override { flux.alloc_params_buffer(); } - void free_params_buffer() { + void free_params_buffer() override { flux.free_params_buffer(); } - void free_compute_buffer() { + void free_compute_buffer() override { flux.free_compute_buffer(); } - void get_param_tensors(std::map& tensors) { + void get_param_tensors(std::map& tensors) override { flux.get_param_tensors(tensors, "model.diffusion_model"); } - size_t get_params_buffer_size() { + size_t get_params_buffer_size() override { return flux.get_params_buffer_size(); } - int64_t get_adm_in_channels() { + int64_t get_adm_in_channels() override { return 768; } void compute(int n_threads, DiffusionParams diffusion_params, - struct ggml_tensor** output = NULL, - struct ggml_context* output_ctx = NULL) { + struct ggml_tensor** output = nullptr, + struct ggml_context* output_ctx = nullptr) override { return flux.compute(n_threads, diffusion_params.x, diffusion_params.timesteps, @@ -218,45 +218,45 @@ struct WanModel : public DiffusionModel { : prefix(prefix), wan(backend, offload_params_to_cpu, tensor_types, prefix, version, flash_attn) { } - std::string get_desc() { + std::string get_desc() override { return wan.get_desc(); } - void alloc_params_buffer() { + void alloc_params_buffer() override { wan.alloc_params_buffer(); } - void free_params_buffer() { + void free_params_buffer() override { wan.free_params_buffer(); } - void free_compute_buffer() { + void free_compute_buffer() override { wan.free_compute_buffer(); } - void 
get_param_tensors(std::map& tensors) { + void get_param_tensors(std::map& tensors) override { wan.get_param_tensors(tensors, prefix); } - size_t get_params_buffer_size() { + size_t get_params_buffer_size() override { return wan.get_params_buffer_size(); } - int64_t get_adm_in_channels() { + int64_t get_adm_in_channels() override { return 768; } void compute(int n_threads, DiffusionParams diffusion_params, - struct ggml_tensor** output = NULL, - struct ggml_context* output_ctx = NULL) { + struct ggml_tensor** output = nullptr, + struct ggml_context* output_ctx = nullptr) override { return wan.compute(n_threads, diffusion_params.x, diffusion_params.timesteps, diffusion_params.context, diffusion_params.y, diffusion_params.c_concat, - NULL, + nullptr, diffusion_params.vace_context, diffusion_params.vace_strength, output, @@ -277,38 +277,38 @@ struct QwenImageModel : public DiffusionModel { : prefix(prefix), qwen_image(backend, offload_params_to_cpu, tensor_types, prefix, version, flash_attn) { } - std::string get_desc() { + std::string get_desc() override { return qwen_image.get_desc(); } - void alloc_params_buffer() { + void alloc_params_buffer() override { qwen_image.alloc_params_buffer(); } - void free_params_buffer() { + void free_params_buffer() override { qwen_image.free_params_buffer(); } - void free_compute_buffer() { + void free_compute_buffer() override { qwen_image.free_compute_buffer(); } - void get_param_tensors(std::map& tensors) { + void get_param_tensors(std::map& tensors) override { qwen_image.get_param_tensors(tensors, prefix); } - size_t get_params_buffer_size() { + size_t get_params_buffer_size() override { return qwen_image.get_params_buffer_size(); } - int64_t get_adm_in_channels() { + int64_t get_adm_in_channels() override { return 768; } void compute(int n_threads, DiffusionParams diffusion_params, - struct ggml_tensor** output = NULL, - struct ggml_context* output_ctx = NULL) { + struct ggml_tensor** output = nullptr, + struct ggml_context* output_ctx = nullptr) override { return qwen_image.compute(n_threads, diffusion_params.x, diffusion_params.timesteps, diff --git a/esrgan.hpp b/esrgan.hpp index fe5f16d2..fa18532d 100644 --- a/esrgan.hpp +++ b/esrgan.hpp @@ -156,9 +156,10 @@ struct ESRGAN : public GGMLRunner { ESRGAN(ggml_backend_t backend, bool offload_params_to_cpu, + int tile_size = 128, const String2GGMLType& tensor_types = {}) : GGMLRunner(backend, offload_params_to_cpu) { - // rrdb_net will be created in load_from_file + this->tile_size = tile_size; } void enable_conv2d_direct() { @@ -174,7 +175,7 @@ struct ESRGAN : public GGMLRunner { } } - std::string get_desc() { + std::string get_desc() override { return "esrgan"; } @@ -367,7 +368,7 @@ struct ESRGAN : public GGMLRunner { void compute(const int n_threads, struct ggml_tensor* x, ggml_tensor** output, - ggml_context* output_ctx = NULL) { + ggml_context* output_ctx = nullptr) { auto get_graph = [&]() -> struct ggml_cgraph* { return build_graph(x); }; diff --git a/examples/cli/README.md b/examples/cli/README.md index 6e8ddd48..ee17d17d 100644 --- a/examples/cli/README.md +++ b/examples/cli/README.md @@ -1,113 +1,110 @@ # Run ``` -usage: ./bin/sd [arguments] +usage: ./bin/sd [options] -arguments: - -h, --help show this help message and exit - -M, --mode [MODE] run mode, one of: [img_gen, vid_gen, upscale, convert], default: img_gen - -t, --threads N number of threads to use during computation (default: -1) - If threads <= 0, then threads will be set to the number of CPU physical cores - --offload-to-cpu place 
the weights in RAM to save VRAM, and automatically load them into VRAM when needed - -m, --model [MODEL] path to full model - --diffusion-model path to the standalone diffusion model - --high-noise-diffusion-model path to the standalone high noise diffusion model - --clip_l path to the clip-l text encoder - --clip_g path to the clip-g text encoder - --clip_vision path to the clip-vision encoder - --t5xxl path to the t5xxl text encoder - --qwen2vl path to the qwen2vl text encoder - --qwen2vl_vision path to the qwen2vl vit - --vae [VAE] path to vae - --taesd [TAESD_PATH] path to taesd. Using Tiny AutoEncoder for fast decoding (low quality) - --control-net [CONTROL_PATH] path to control net model - --embd-dir [EMBEDDING_PATH] path to embeddings - --upscale-model [ESRGAN_PATH] path to esrgan model. For img_gen mode, upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now - --upscale-repeats Run the ESRGAN upscaler this many times (default 1) - --type [TYPE] weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K) - If not specified, the default is the type of the weight file - --tensor-type-rules [EXPRESSION] weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0") - --lora-model-dir [DIR] lora model directory - -i, --init-img [IMAGE] path to the init image, required by img2img - --mask [MASK] path to the mask image, required by img2img with mask - -i, --end-img [IMAGE] path to the end image, required by flf2v - --control-image [IMAGE] path to image condition, control net - -r, --ref-image [PATH] reference image for Flux Kontext models (can be used multiple times) - --control-video [PATH] path to control video frames, It must be a directory path. - The video frames inside should be stored as images in lexicographical (character) order - For example, if the control video path is `frames`, the directory contain images such as 00.png, 01.png, … etc. - --increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1). 
- -o, --output OUTPUT path to write result image to (default: ./output.png) - -p, --prompt [PROMPT] the prompt to render - -n, --negative-prompt PROMPT the negative prompt (default: "") - --cfg-scale SCALE unconditional guidance scale: (default: 7.0) - --img-cfg-scale SCALE image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale) - --guidance SCALE distilled guidance scale for models with guidance input (default: 3.5) - --slg-scale SCALE skip layer guidance (SLG) scale, only for DiT models: (default: 0) - 0 means disabled, a value of 2.5 is nice for sd3.5 medium - --eta SCALE eta in DDIM, only for DDIM and TCD: (default: 0) - --skip-layers LAYERS Layers to skip for SLG steps: (default: [7,8,9]) - --skip-layer-start START SLG enabling point: (default: 0.01) - --skip-layer-end END SLG disabling point: (default: 0.2) - --scheduler {discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple} Denoiser sigma scheduler (default: discrete) - --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd} - sampling method (default: "euler" for Flux/SD3/Wan, "euler_a" otherwise) - --timestep-shift N shift timestep for NitroFusion models, default: 0, recommended N for NitroSD-Realism around 250 and 500 for NitroSD-Vibrant - --steps STEPS number of sample steps (default: 20) - --high-noise-cfg-scale SCALE (high noise) unconditional guidance scale: (default: 7.0) - --high-noise-img-cfg-scale SCALE (high noise) image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale) - --high-noise-guidance SCALE (high noise) distilled guidance scale for models with guidance input (default: 3.5) - --high-noise-slg-scale SCALE (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0) - 0 means disabled, a value of 2.5 is nice for sd3.5 medium - --high-noise-eta SCALE (high noise) eta in DDIM, only for DDIM and TCD: (default: 0) - --high-noise-skip-layers LAYERS (high noise) Layers to skip for SLG steps: (default: [7,8,9]) - --high-noise-skip-layer-start (high noise) SLG enabling point: (default: 0.01) - --high-noise-skip-layer-end END (high noise) SLG disabling point: (default: 0.2) - --high-noise-scheduler {discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple} Denoiser sigma scheduler (default: discrete) - --high-noise-sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd} - (high noise) sampling method (default: "euler_a") - --high-noise-steps STEPS (high noise) number of sample steps (default: -1 = auto) - SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END]) - --strength STRENGTH strength for noising/unnoising (default: 0.75) - --control-strength STRENGTH strength to apply Control Net (default: 0.9) - 1.0 corresponds to full destruction of information in init image - -H, --height H image height, in pixel space (default: 512) - -W, --width W image width, in pixel space (default: 512) - --rng {std_default, cuda} RNG (default: cuda) - -s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0) - -b, --batch-count COUNT number of images to generate - --prediction {eps, v, edm_v, sd3_flow, flux_flow} Prediction type override - --clip-skip N ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1) - <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x - --vae-tiling process vae in tiles to reduce memory 
usage - --vae-tile-size [X]x[Y] tile size for vae tiling (default: 32x32) - --vae-relative-tile-size [X]x[Y] relative tile size for vae tiling, in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size) - --vae-tile-overlap OVERLAP tile overlap for vae tiling, in fraction of tile size (default: 0.5) - --force-sdxl-vae-conv-scale force use of conv scale on sdxl vae - --vae-on-cpu keep vae in cpu (for low vram) - --clip-on-cpu keep clip in cpu (for low vram) - --diffusion-fa use flash attention in the diffusion model (for low vram) - Might lower quality, since it implies converting k and v to f16. - This might crash if it is not supported by the backend. - --diffusion-conv-direct use Conv2d direct in the diffusion model - This might crash if it is not supported by the backend. - --vae-conv-direct use Conv2d direct in the vae model (should improve the performance) - This might crash if it is not supported by the backend. - --control-net-cpu keep controlnet in cpu (for low vram) - --canny apply canny preprocessor (edge detection) - --color colors the logging tags according to level - --chroma-disable-dit-mask disable dit mask for chroma - --chroma-enable-t5-mask enable t5 mask for chroma - --chroma-t5-mask-pad PAD_SIZE t5 mask pad size of chroma - --video-frames video frames (default: 1) - --fps fps (default: 24) - --moe-boundary BOUNDARY timestep boundary for Wan2.2 MoE model. (default: 0.875) - only enabled if `--high-noise-steps` is set to -1 - --flow-shift SHIFT shift value for Flow models like SD3.x or WAN (default: auto) - --vace-strength wan vace strength - --photo-maker path to PHOTOMAKER model - --pm-id-images-dir [DIR] path to PHOTOMAKER input id images dir - --pm-id-embed-path [PATH] path to PHOTOMAKER v2 id embed - --pm-style-strength strength for keeping PHOTOMAKER input identity (default: 20) - -v, --verbose print extra info +Options: + -m, --model path to full model + --clip_l path to the clip-l text encoder + --clip_g path to the clip-g text encoder + --clip_vision path to the clip-vision encoder + --t5xxl path to the t5xxl text encoder + --qwen2vl path to the qwen2vl text encoder + --qwen2vl_vision path to the qwen2vl vit + --diffusion-model path to the standalone diffusion model + --high-noise-diffusion-model path to the standalone high noise diffusion model + --vae path to standalone vae model + --taesd path to taesd. Using Tiny AutoEncoder for fast decoding (low quality) + --control-net path to control net model + --embd-dir embeddings directory + --lora-model-dir lora model directory + -i, --init-img path to the init image + --end-img path to the end image, required by flf2v + --tensor-type-rules weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0") + --photo-maker path to PHOTOMAKER model + --pm-id-images-dir path to PHOTOMAKER input id images dir + --pm-id-embed-path path to PHOTOMAKER v2 id embed + --mask path to the mask image + --control-image path to control image, control net + --control-video path to control video frames, It must be a directory path. The video frames inside should be stored as images in + lexicographical (character) order. For example, if the control video path is + `frames`, the directory contain images such as 00.png, 01.png, ... etc. + -o, --output path to write result image to (default: ./output.png) + -p, --prompt the prompt to render + -n, --negative-prompt the negative prompt (default: "") + --upscale-model path to esrgan model. 
+ -t, --threads number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of + CPU physical cores + --upscale-repeats Run the ESRGAN upscaler this many times (default: 1) + -H, --height image height, in pixel space (default: 512) + -W, --width image width, in pixel space (default: 512) + --steps number of sample steps (default: 20) + --high-noise-steps (high noise) number of sample steps (default: -1 = auto) + --clip-skip ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1). <= 0 represents unspecified, + will be 1 for SD1.x, 2 for SD2.x + -b, --batch-count batch count + --chroma-t5-mask-pad t5 mask pad size of chroma + --video-frames video frames (default: 1) + --fps fps (default: 24) + --timestep-shift shift timestep for NitroFusion models (default: 0). recommended N for NitroSD-Realism around 250 and 500 for + NitroSD-Vibrant + --cfg-scale unconditional guidance scale: (default: 7.0) + --img-cfg-scale image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale) + --guidance distilled guidance scale for models with guidance input (default: 3.5) + --slg-scale skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means disabled, a value of 2.5 is nice for sd3.5 + medium + --skip-layer-start SLG enabling point (default: 0.01) + --skip-layer-end SLG disabling point (default: 0.2) + --eta eta in DDIM, only for DDIM and TCD (default: 0) + --high-noise-cfg-scale (high noise) unconditional guidance scale: (default: 7.0) + --high-noise-img-cfg-scale (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale) + --high-noise-guidance (high noise) distilled guidance scale for models with guidance input (default: 3.5) + --high-noise-slg-scale (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0) + --high-noise-skip-layer-start (high noise) SLG enabling point (default: 0.01) + --high-noise-skip-layer-end (high noise) SLG disabling point (default: 0.2) + --high-noise-eta (high noise) eta in DDIM, only for DDIM and TCD (default: 0) + --strength strength for noising/unnoising (default: 0.75) + --pm-style-strength + --control-strength strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image + --moe-boundary timestep boundary for Wan2.2 MoE model. (default: 0.875). 
Only enabled if `--high-noise-steps` is set to -1 + --flow-shift shift value for Flow models like SD3.x or WAN (default: auto) + --vace-strength wan vace strength + --vae-tile-overlap tile overlap for vae tiling, in fraction of tile size (default: 0.5) + --vae-tiling process vae in tiles to reduce memory usage + --force-sdxl-vae-conv-scale force use of conv scale on sdxl vae + --offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed + --control-net-cpu keep controlnet in cpu (for low vram) + --clip-on-cpu keep clip in cpu (for low vram) + --vae-on-cpu keep vae in cpu (for low vram) + --diffusion-fa use flash attention in the diffusion model + --diffusion-conv-direct use ggml_conv2d_direct in the diffusion model + --vae-conv-direct use ggml_conv2d_direct in the vae model + --canny apply canny preprocessor (edge detection) + -v, --verbose print extra info + --color colors the logging tags according to level + --chroma-disable-dit-mask disable dit mask for chroma + --chroma-enable-t5-mask enable t5 mask for chroma + --increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1). + --disable-auto-resize-ref-image disable auto resize of ref images + -M, --mode run mode, one of [img_gen, vid_gen, upscale, convert], default: img_gen + --type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the + type of the weight file + --rng RNG, one of [std_default, cuda], default: cuda + -s, --seed RNG seed (default: 42, use random seed for < 0) + --sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, + tcd] (default: euler for Flux/SD3/Wan, euler_a otherwise) + --prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow] + --scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple], default: + discrete + --skip-layers layers to skip for SLG steps (default: [7,8,9]) + --high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, + ddim_trailing, tcd] default: euler for Flux/SD3/Wan, euler_a otherwise + --high-noise-scheduler (high noise) denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, + simple], default: discrete + --high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9]) + -r, --ref-image reference image for Flux Kontext models (can be used multiple times) + -h, --help show this help message and exit + --vae-tile-size tile size for vae tiling, format [X]x[Y] (default: 32x32) + --vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1 + (overrides --vae-tile-size) ``` \ No newline at end of file diff --git a/examples/cli/avi_writer.h b/examples/cli/avi_writer.h index 8cfb9a57..84b204af 100644 --- a/examples/cli/avi_writer.h +++ b/examples/cli/avi_writer.h @@ -1,10 +1,10 @@ #ifndef __AVI_WRITER_H__ #define __AVI_WRITER_H__ -#include -#include -#include -#include +#include +#include +#include +#include #include "stable-diffusion.h" @@ -130,7 +130,7 @@ int create_mjpg_avi_from_sd_images(const char* filename, sd_image_t* images, int write_u32_le(f, 0); // Colors important // 'movi' LIST (video frames) - long movi_list_pos = 
ftell(f); + // long movi_list_pos = ftell(f); fwrite("LIST", 4, 1, f); long movi_size_pos = ftell(f); write_u32_le(f, 0); // Placeholder for movi size @@ -149,7 +149,7 @@ } jpeg_data; for (int i = 0; i < num_images; i++) { - jpeg_data.buf = NULL; + jpeg_data.buf = nullptr; jpeg_data.size = 0; // Callback function to collect JPEG data into memory diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index ff36cea2..24f81032 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -80,7 +81,8 @@ struct SDParams { std::string control_image_path; std::vector<std::string> ref_image_paths; std::string control_video_path; - bool increase_ref_index = false; + bool auto_resize_ref_image = true; + bool increase_ref_index = false; std::string prompt; std::string negative_prompt; @@ -116,6 +118,7 @@ struct SDParams { bool canny_preprocess = false; bool color = false; int upscale_repeats = 1; + int upscale_tile_size = 128; // Photo Maker std::string photo_maker_path; @@ -175,6 +178,7 @@ void print_params(SDParams params) { printf(" %s\n", path.c_str()); }; printf(" control_video_path: %s\n", params.control_video_path.c_str()); + printf(" auto_resize_ref_image: %s\n", params.auto_resize_ref_image ? "true" : "false"); printf(" increase_ref_index: %s\n", params.increase_ref_index ? "true" : "false"); printf(" offload_params_to_cpu: %s\n", params.offload_params_to_cpu ? "true" : "false"); printf(" clip_on_cpu: %s\n", params.clip_on_cpu ? "true" : "false"); @@ -201,6 +205,7 @@ void print_params(SDParams params) { printf(" vae_tiling: %s\n", params.vae_tiling_params.enabled ? "true" : "false"); printf(" force_sdxl_vae_conv_scale: %s\n", params.force_sdxl_vae_conv_scale ? "true" : "false"); printf(" upscale_repeats: %d\n", params.upscale_repeats); + printf(" upscale_tile_size: %d\n", params.upscale_tile_size); printf(" chroma_use_dit_mask: %s\n", params.chroma_use_dit_mask ? "true" : "false"); printf(" chroma_use_t5_mask: %s\n", params.chroma_use_t5_mask ? 
"true" : "false"); printf(" chroma_t5_mask_pad: %d\n", params.chroma_t5_mask_pad); @@ -211,118 +216,6 @@ void print_params(SDParams params) { free(high_noise_sample_params_str); } -void print_usage(int argc, const char* argv[]) { - printf("usage: %s [arguments]\n", argv[0]); - printf("\n"); - printf("arguments:\n"); - printf(" -h, --help show this help message and exit\n"); - printf(" -M, --mode [MODE] run mode, one of: [img_gen, vid_gen, upscale, convert], default: img_gen\n"); - printf(" -t, --threads N number of threads to use during computation (default: -1)\n"); - printf(" If threads <= 0, then threads will be set to the number of CPU physical cores\n"); - printf(" --offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed\n"); - printf(" -m, --model [MODEL] path to full model\n"); - printf(" --diffusion-model path to the standalone diffusion model\n"); - printf(" --high-noise-diffusion-model path to the standalone high noise diffusion model\n"); - printf(" --clip_l path to the clip-l text encoder\n"); - printf(" --clip_g path to the clip-g text encoder\n"); - printf(" --clip_vision path to the clip-vision encoder\n"); - printf(" --t5xxl path to the t5xxl text encoder\n"); - printf(" --qwen2vl path to the qwen2vl text encoder\n"); - printf(" --qwen2vl_vision path to the qwen2vl vit\n"); - printf(" --vae [VAE] path to vae\n"); - printf(" --taesd [TAESD_PATH] path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)\n"); - printf(" --control-net [CONTROL_PATH] path to control net model\n"); - printf(" --embd-dir [EMBEDDING_PATH] path to embeddings\n"); - printf(" --upscale-model [ESRGAN_PATH] path to esrgan model. For img_gen mode, upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now\n"); - printf(" --upscale-repeats Run the ESRGAN upscaler this many times (default 1)\n"); - printf(" --type [TYPE] weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K)\n"); - printf(" If not specified, the default is the type of the weight file\n"); - printf(" --tensor-type-rules [EXPRESSION] weight type per tensor pattern (example: \"^vae\\.=f16,model\\.=q8_0\")\n"); - printf(" --lora-model-dir [DIR] lora model directory\n"); - printf(" -i, --init-img [IMAGE] path to the init image, required by img2img\n"); - printf(" --mask [MASK] path to the mask image, required by img2img with mask\n"); - printf(" -i, --end-img [IMAGE] path to the end image, required by flf2v\n"); - printf(" --control-image [IMAGE] path to image condition, control net\n"); - printf(" -r, --ref-image [PATH] reference image for Flux Kontext models (can be used multiple times) \n"); - printf(" --control-video [PATH] path to control video frames, It must be a directory path.\n"); - printf(" The video frames inside should be stored as images in lexicographical (character) order\n"); - printf(" For example, if the control video path is `frames`, the directory contain images such as 00.png, 01.png, … etc.\n"); - printf(" --increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1).\n"); - printf(" -o, --output OUTPUT path to write result image to (default: ./output.png)\n"); - printf(" -p, --prompt [PROMPT] the prompt to render\n"); - printf(" -n, --negative-prompt PROMPT the negative prompt (default: \"\")\n"); - printf(" --cfg-scale SCALE unconditional guidance scale: (default: 7.0)\n"); - printf(" --img-cfg-scale SCALE image guidance scale for inpaint or 
instruct-pix2pix models: (default: same as --cfg-scale)\n"); - printf(" --guidance SCALE distilled guidance scale for models with guidance input (default: 3.5)\n"); - printf(" --slg-scale SCALE skip layer guidance (SLG) scale, only for DiT models: (default: 0)\n"); - printf(" 0 means disabled, a value of 2.5 is nice for sd3.5 medium\n"); - printf(" --eta SCALE eta in DDIM, only for DDIM and TCD: (default: 0)\n"); - printf(" --skip-layers LAYERS Layers to skip for SLG steps: (default: [7,8,9])\n"); - printf(" --skip-layer-start START SLG enabling point: (default: 0.01)\n"); - printf(" --skip-layer-end END SLG disabling point: (default: 0.2)\n"); - printf(" --scheduler {discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple} Denoiser sigma scheduler (default: discrete)\n"); - printf(" --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}\n"); - printf(" sampling method (default: \"euler\" for Flux/SD3/Wan, \"euler_a\" otherwise)\n"); - printf(" --timestep-shift N shift timestep for NitroFusion models, default: 0, recommended N for NitroSD-Realism around 250 and 500 for NitroSD-Vibrant\n"); - printf(" --steps STEPS number of sample steps (default: 20)\n"); - printf(" --high-noise-cfg-scale SCALE (high noise) unconditional guidance scale: (default: 7.0)\n"); - printf(" --high-noise-img-cfg-scale SCALE (high noise) image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)\n"); - printf(" --high-noise-guidance SCALE (high noise) distilled guidance scale for models with guidance input (default: 3.5)\n"); - printf(" --high-noise-slg-scale SCALE (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)\n"); - printf(" 0 means disabled, a value of 2.5 is nice for sd3.5 medium\n"); - printf(" --high-noise-eta SCALE (high noise) eta in DDIM, only for DDIM and TCD: (default: 0)\n"); - printf(" --high-noise-skip-layers LAYERS (high noise) Layers to skip for SLG steps: (default: [7,8,9])\n"); - printf(" --high-noise-skip-layer-start (high noise) SLG enabling point: (default: 0.01)\n"); - printf(" --high-noise-skip-layer-end END (high noise) SLG disabling point: (default: 0.2)\n"); - printf(" --high-noise-scheduler {discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple} Denoiser sigma scheduler (default: discrete)\n"); - printf(" --high-noise-sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}\n"); - printf(" (high noise) sampling method (default: \"euler_a\")\n"); - printf(" --high-noise-steps STEPS (high noise) number of sample steps (default: -1 = auto)\n"); - printf(" SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])\n"); - printf(" --strength STRENGTH strength for noising/unnoising (default: 0.75)\n"); - printf(" --control-strength STRENGTH strength to apply Control Net (default: 0.9)\n"); - printf(" 1.0 corresponds to full destruction of information in init image\n"); - printf(" -H, --height H image height, in pixel space (default: 512)\n"); - printf(" -W, --width W image width, in pixel space (default: 512)\n"); - printf(" --rng {std_default, cuda} RNG (default: cuda)\n"); - printf(" -s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0)\n"); - printf(" -b, --batch-count COUNT number of images to generate\n"); - printf(" --prediction {eps, v, edm_v, sd3_flow, flux_flow} Prediction type override.\n"); - printf(" --clip-skip N 
ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)\n"); - printf(" <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x\n"); - printf(" --vae-tiling process vae in tiles to reduce memory usage\n"); - printf(" --vae-tile-size [X]x[Y] tile size for vae tiling (default: 32x32)\n"); - printf(" --vae-relative-tile-size [X]x[Y] relative tile size for vae tiling, in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)\n"); - printf(" --vae-tile-overlap OVERLAP tile overlap for vae tiling, in fraction of tile size (default: 0.5)\n"); - printf(" --force-sdxl-vae-conv-scale force use of conv scale on sdxl vae\n"); - printf(" --vae-on-cpu keep vae in cpu (for low vram)\n"); - printf(" --clip-on-cpu keep clip in cpu (for low vram)\n"); - printf(" --diffusion-fa use flash attention in the diffusion model (for low vram)\n"); - printf(" Might lower quality, since it implies converting k and v to f16.\n"); - printf(" This might crash if it is not supported by the backend.\n"); - printf(" --diffusion-conv-direct use Conv2d direct in the diffusion model\n"); - printf(" This might crash if it is not supported by the backend.\n"); - printf(" --vae-conv-direct use Conv2d direct in the vae model (should improve the performance)\n"); - printf(" This might crash if it is not supported by the backend.\n"); - printf(" --control-net-cpu keep controlnet in cpu (for low vram)\n"); - printf(" --canny apply canny preprocessor (edge detection)\n"); - printf(" --color colors the logging tags according to level\n"); - printf(" --chroma-disable-dit-mask disable dit mask for chroma\n"); - printf(" --chroma-enable-t5-mask enable t5 mask for chroma\n"); - printf(" --chroma-t5-mask-pad PAD_SIZE t5 mask pad size of chroma\n"); - printf(" --video-frames video frames (default: 1)\n"); - printf(" --fps fps (default: 24)\n"); - printf(" --moe-boundary BOUNDARY timestep boundary for Wan2.2 MoE model. 
(default: 0.875)\n"); - printf(" only enabled if `--high-noise-steps` is set to -1\n"); - printf(" --flow-shift SHIFT shift value for Flow models like SD3.x or WAN (default: auto)\n"); - printf(" --vace-strength wan vace strength\n"); - printf(" --photo-maker path to PHOTOMAKER model\n"); - printf(" --pm-id-images-dir [DIR] path to PHOTOMAKER input id images dir\n"); - printf(" --pm-id-embed-path [PATH] path to PHOTOMAKER v2 id embed\n"); - printf(" --pm-style-strength strength for keeping PHOTOMAKER input identity (default: 20)\n"); - printf(" -v, --verbose print extra info\n"); -} - #if defined(_WIN32) static std::string utf16_to_utf8(const std::wstring& wstr) { if (wstr.empty()) @@ -492,93 +385,428 @@ bool parse_options(int argc, const char** argv, ArgOptions& options) { return true; } +static std::string wrap_text(const std::string& text, size_t width, size_t indent) { + std::ostringstream oss; + size_t line_len = 0; + size_t pos = 0; + + while (pos < text.size()) { + // Preserve manual newlines + if (text[pos] == '\n') { + oss << '\n' + << std::string(indent, ' '); + line_len = indent; + ++pos; + continue; + } + + // Add the character + oss << text[pos]; + ++line_len; + ++pos; + + // If the current line exceeds width, try to break at the last space + if (line_len >= width) { + std::string current = oss.str(); + size_t back = current.size(); + + // Find the last space (for a clean break) + while (back > 0 && current[back - 1] != ' ' && current[back - 1] != '\n') + --back; + + // If found a space to break on + if (back > 0 && current[back - 1] != '\n') { + std::string before = current.substr(0, back - 1); + std::string after = current.substr(back); + oss.str(""); + oss.clear(); + oss << before << "\n" + << std::string(indent, ' ') << after; + } else { + // If no space found, just break at width + oss << "\n" + << std::string(indent, ' '); + } + line_len = indent; + } + } + + return oss.str(); +} + +void print_usage(int argc, const char* argv[], const ArgOptions& options) { + constexpr size_t max_line_width = 120; + + std::cout << "Usage: " << argv[0] << " [options]\n\n"; + std::cout << "Options:\n"; + + struct Entry { + std::string names; + std::string desc; + }; + std::vector entries; + + auto add_entry = [&](const std::string& s, const std::string& l, + const std::string& desc, const std::string& hint = "") { + std::ostringstream ss; + if (!s.empty()) + ss << s; + if (!s.empty() && !l.empty()) + ss << ", "; + if (!l.empty()) + ss << l; + if (!hint.empty()) + ss << " " << hint; + entries.push_back({ss.str(), desc}); + }; + + for (auto& o : options.string_options) + add_entry(o.short_name, o.long_name, o.desc, ""); + for (auto& o : options.int_options) + add_entry(o.short_name, o.long_name, o.desc, ""); + for (auto& o : options.float_options) + add_entry(o.short_name, o.long_name, o.desc, ""); + for (auto& o : options.bool_options) + add_entry(o.short_name, o.long_name, o.desc, ""); + for (auto& o : options.manual_options) + add_entry(o.short_name, o.long_name, o.desc); + + size_t max_name_width = 0; + for (auto& e : entries) + max_name_width = std::max(max_name_width, e.names.size()); + + for (auto& e : entries) { + size_t indent = 2 + max_name_width + 4; + size_t desc_width = (max_line_width > indent ? 
max_line_width - indent : 40); + std::string wrapped_desc = wrap_text(e.desc, max_line_width, indent); + std::cout << " " << std::left << std::setw(static_cast(max_name_width) + 4) + << e.names << wrapped_desc << "\n"; + } +} + void parse_args(int argc, const char** argv, SDParams& params) { ArgOptions options; options.string_options = { - {"-m", "--model", "", ¶ms.model_path}, - {"", "--clip_l", "", ¶ms.clip_l_path}, - {"", "--clip_g", "", ¶ms.clip_g_path}, - {"", "--clip_vision", "", ¶ms.clip_vision_path}, - {"", "--t5xxl", "", ¶ms.t5xxl_path}, - {"", "--qwen2vl", "", ¶ms.qwen2vl_path}, - {"", "--qwen2vl_vision", "", ¶ms.qwen2vl_vision_path}, - {"", "--diffusion-model", "", ¶ms.diffusion_model_path}, - {"", "--high-noise-diffusion-model", "", ¶ms.high_noise_diffusion_model_path}, - {"", "--vae", "", ¶ms.vae_path}, - {"", "--taesd", "", ¶ms.taesd_path}, - {"", "--control-net", "", ¶ms.control_net_path}, - {"", "--embd-dir", "", ¶ms.embedding_dir}, - {"", "--lora-model-dir", "", ¶ms.lora_model_dir}, - {"-i", "--init-img", "", ¶ms.init_image_path}, - {"", "--end-img", "", ¶ms.end_image_path}, - {"", "--tensor-type-rules", "", ¶ms.tensor_type_rules}, - {"", "--photo-maker", "", ¶ms.photo_maker_path}, - {"", "--pm-id-images-dir", "", ¶ms.pm_id_images_dir}, - {"", "--pm-id-embed-path", "", ¶ms.pm_id_embed_path}, - {"", "--mask", "", ¶ms.mask_image_path}, - {"", "--control-image", "", ¶ms.control_image_path}, - {"", "--control-video", "", ¶ms.control_video_path}, - {"-o", "--output", "", ¶ms.output_path}, - {"-p", "--prompt", "", ¶ms.prompt}, - {"-n", "--negative-prompt", "", ¶ms.negative_prompt}, - {"", "--upscale-model", "", ¶ms.esrgan_path}, + {"-m", + "--model", + "path to full model", + ¶ms.model_path}, + {"", + "--clip_l", + "path to the clip-l text encoder", ¶ms.clip_l_path}, + {"", "--clip_g", + "path to the clip-g text encoder", + ¶ms.clip_g_path}, + {"", + "--clip_vision", + "path to the clip-vision encoder", + ¶ms.clip_vision_path}, + {"", + "--t5xxl", + "path to the t5xxl text encoder", + ¶ms.t5xxl_path}, + {"", + "--qwen2vl", + "path to the qwen2vl text encoder", + ¶ms.qwen2vl_path}, + {"", + "--qwen2vl_vision", + "path to the qwen2vl vit", + ¶ms.qwen2vl_vision_path}, + {"", + "--diffusion-model", + "path to the standalone diffusion model", + ¶ms.diffusion_model_path}, + {"", + "--high-noise-diffusion-model", + "path to the standalone high noise diffusion model", + ¶ms.high_noise_diffusion_model_path}, + {"", + "--vae", + "path to standalone vae model", + ¶ms.vae_path}, + {"", + "--taesd", + "path to taesd. 
Using Tiny AutoEncoder for fast decoding (low quality)", + ¶ms.taesd_path}, + {"", + "--control-net", + "path to control net model", + ¶ms.control_net_path}, + {"", + "--embd-dir", + "embeddings directory", + ¶ms.embedding_dir}, + {"", + "--lora-model-dir", + "lora model directory", + ¶ms.lora_model_dir}, + {"-i", + "--init-img", + "path to the init image", + ¶ms.init_image_path}, + {"", + "--end-img", + "path to the end image, required by flf2v", + ¶ms.end_image_path}, + {"", + "--tensor-type-rules", + "weight type per tensor pattern (example: \"^vae\\.=f16,model\\.=q8_0\")", + ¶ms.tensor_type_rules}, + {"", + "--photo-maker", + "path to PHOTOMAKER model", + ¶ms.photo_maker_path}, + {"", + "--pm-id-images-dir", + "path to PHOTOMAKER input id images dir", + ¶ms.pm_id_images_dir}, + {"", + "--pm-id-embed-path", + "path to PHOTOMAKER v2 id embed", + ¶ms.pm_id_embed_path}, + {"", + "--mask", + "path to the mask image", + ¶ms.mask_image_path}, + {"", + "--control-image", + "path to control image, control net", + ¶ms.control_image_path}, + {"", + "--control-video", + "path to control video frames, It must be a directory path. The video frames inside should be stored as images in " + "lexicographical (character) order. For example, if the control video path is `frames`, the directory contain images " + "such as 00.png, 01.png, ... etc.", + ¶ms.control_video_path}, + {"-o", + "--output", + "path to write result image to (default: ./output.png)", + ¶ms.output_path}, + {"-p", + "--prompt", + "the prompt to render", + ¶ms.prompt}, + {"-n", + "--negative-prompt", + "the negative prompt (default: \"\")", + ¶ms.negative_prompt}, + {"", + "--upscale-model", + "path to esrgan model.", + ¶ms.esrgan_path}, }; options.int_options = { - {"-t", "--threads", "", ¶ms.n_threads}, - {"", "--upscale-repeats", "", ¶ms.upscale_repeats}, - {"-H", "--height", "", ¶ms.height}, - {"-W", "--width", "", ¶ms.width}, - {"", "--steps", "", ¶ms.sample_params.sample_steps}, - {"", "--high-noise-steps", "", ¶ms.high_noise_sample_params.sample_steps}, - {"", "--clip-skip", "", ¶ms.clip_skip}, - {"-b", "--batch-count", "", ¶ms.batch_count}, - {"", "--chroma-t5-mask-pad", "", ¶ms.chroma_t5_mask_pad}, - {"", "--video-frames", "", ¶ms.video_frames}, - {"", "--fps", "", ¶ms.fps}, - {"", "--timestep-shift", "", ¶ms.sample_params.shifted_timestep}, + {"-t", + "--threads", + "number of threads to use during computation (default: -1). " + "If threads <= 0, then threads will be set to the number of CPU physical cores", + ¶ms.n_threads}, + {"", + "--upscale-repeats", + "Run the ESRGAN upscaler this many times (default: 1)", + ¶ms.upscale_repeats}, + {"", + "--upscale-tile-size", + "tile size for ESRGAN upscaling (default: 128)", + ¶ms.upscale_tile_size}, + {"-H", + "--height", + "image height, in pixel space (default: 512)", + ¶ms.height}, + {"-W", + "--width", + "image width, in pixel space (default: 512)", + ¶ms.width}, + {"", + "--steps", + "number of sample steps (default: 20)", + ¶ms.sample_params.sample_steps}, + {"", + "--high-noise-steps", + "(high noise) number of sample steps (default: -1 = auto)", + ¶ms.high_noise_sample_params.sample_steps}, + {"", + "--clip-skip", + "ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1). 
" + "<= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x", + ¶ms.clip_skip}, + {"-b", + "--batch-count", + "batch count", + ¶ms.batch_count}, + {"", + "--chroma-t5-mask-pad", + "t5 mask pad size of chroma", + ¶ms.chroma_t5_mask_pad}, + {"", + "--video-frames", + "video frames (default: 1)", + ¶ms.video_frames}, + {"", + "--fps", + "fps (default: 24)", + ¶ms.fps}, + {"", + "--timestep-shift", + "shift timestep for NitroFusion models (default: 0). " + "recommended N for NitroSD-Realism around 250 and 500 for NitroSD-Vibrant", + ¶ms.sample_params.shifted_timestep}, }; options.float_options = { - {"", "--cfg-scale", "", ¶ms.sample_params.guidance.txt_cfg}, - {"", "--img-cfg-scale", "", ¶ms.sample_params.guidance.img_cfg}, - {"", "--guidance", "", ¶ms.sample_params.guidance.distilled_guidance}, - {"", "--slg-scale", "", ¶ms.sample_params.guidance.slg.scale}, - {"", "--skip-layer-start", "", ¶ms.sample_params.guidance.slg.layer_start}, - {"", "--skip-layer-end", "", ¶ms.sample_params.guidance.slg.layer_end}, - {"", "--eta", "", ¶ms.sample_params.eta}, - {"", "--high-noise-cfg-scale", "", ¶ms.high_noise_sample_params.guidance.txt_cfg}, - {"", "--high-noise-img-cfg-scale", "", ¶ms.high_noise_sample_params.guidance.img_cfg}, - {"", "--high-noise-guidance", "", ¶ms.high_noise_sample_params.guidance.distilled_guidance}, - {"", "--high-noise-slg-scale", "", ¶ms.high_noise_sample_params.guidance.slg.scale}, - {"", "--high-noise-skip-layer-start", "", ¶ms.high_noise_sample_params.guidance.slg.layer_start}, - {"", "--high-noise-skip-layer-end", "", ¶ms.high_noise_sample_params.guidance.slg.layer_end}, - {"", "--high-noise-eta", "", ¶ms.high_noise_sample_params.eta}, - {"", "--strength", "", ¶ms.strength}, - {"", "--pm-style-strength", "", ¶ms.pm_style_strength}, - {"", "--control-strength", "", ¶ms.control_strength}, - {"", "--moe-boundary", "", ¶ms.moe_boundary}, - {"", "--flow-shift", "", ¶ms.flow_shift}, - {"", "--vace-strength", "", ¶ms.vace_strength}, - {"", "--vae-tile-overlap", "", ¶ms.vae_tiling_params.target_overlap}, + {"", + "--cfg-scale", + "unconditional guidance scale: (default: 7.0)", + ¶ms.sample_params.guidance.txt_cfg}, + {"", + "--img-cfg-scale", + "image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)", + ¶ms.sample_params.guidance.img_cfg}, + {"", + "--guidance", + "distilled guidance scale for models with guidance input (default: 3.5)", + ¶ms.sample_params.guidance.distilled_guidance}, + {"", + "--slg-scale", + "skip layer guidance (SLG) scale, only for DiT models: (default: 0). 
0 means disabled, a value of 2.5 is nice for sd3.5 medium", + ¶ms.sample_params.guidance.slg.scale}, + {"", + "--skip-layer-start", + "SLG enabling point (default: 0.01)", + ¶ms.sample_params.guidance.slg.layer_start}, + {"", + "--skip-layer-end", + "SLG disabling point (default: 0.2)", + ¶ms.sample_params.guidance.slg.layer_end}, + {"", + "--eta", + "eta in DDIM, only for DDIM and TCD (default: 0)", + ¶ms.sample_params.eta}, + {"", + "--high-noise-cfg-scale", + "(high noise) unconditional guidance scale: (default: 7.0)", + ¶ms.high_noise_sample_params.guidance.txt_cfg}, + {"", + "--high-noise-img-cfg-scale", + "(high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale)", + ¶ms.high_noise_sample_params.guidance.img_cfg}, + {"", + "--high-noise-guidance", + "(high noise) distilled guidance scale for models with guidance input (default: 3.5)", + ¶ms.high_noise_sample_params.guidance.distilled_guidance}, + {"", + "--high-noise-slg-scale", + "(high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)", + ¶ms.high_noise_sample_params.guidance.slg.scale}, + {"", + "--high-noise-skip-layer-start", + "(high noise) SLG enabling point (default: 0.01)", + ¶ms.high_noise_sample_params.guidance.slg.layer_start}, + {"", + "--high-noise-skip-layer-end", + "(high noise) SLG disabling point (default: 0.2)", + ¶ms.high_noise_sample_params.guidance.slg.layer_end}, + {"", + "--high-noise-eta", + "(high noise) eta in DDIM, only for DDIM and TCD (default: 0)", + ¶ms.high_noise_sample_params.eta}, + {"", + "--strength", + "strength for noising/unnoising (default: 0.75)", + ¶ms.strength}, + {"", + "--pm-style-strength", + "", + ¶ms.pm_style_strength}, + {"", + "--control-strength", + "strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image", + ¶ms.control_strength}, + {"", + "--moe-boundary", + "timestep boundary for Wan2.2 MoE model. (default: 0.875). 
Only enabled if `--high-noise-steps` is set to -1", + ¶ms.moe_boundary}, + {"", + "--flow-shift", + "shift value for Flow models like SD3.x or WAN (default: auto)", + ¶ms.flow_shift}, + {"", + "--vace-strength", + "wan vace strength", + ¶ms.vace_strength}, + {"", + "--vae-tile-overlap", + "tile overlap for vae tiling, in fraction of tile size (default: 0.5)", + ¶ms.vae_tiling_params.target_overlap}, }; options.bool_options = { - {"", "--vae-tiling", "", true, ¶ms.vae_tiling_params.enabled}, - {"", "--force-sdxl-vae-conv-scale", "", true, ¶ms.force_sdxl_vae_conv_scale}, - {"", "--offload-to-cpu", "", true, ¶ms.offload_params_to_cpu}, - {"", "--control-net-cpu", "", true, ¶ms.control_net_cpu}, - {"", "--clip-on-cpu", "", true, ¶ms.clip_on_cpu}, - {"", "--vae-on-cpu", "", true, ¶ms.vae_on_cpu}, - {"", "--diffusion-fa", "", true, ¶ms.diffusion_flash_attn}, - {"", "--diffusion-conv-direct", "", true, ¶ms.diffusion_conv_direct}, - {"", "--vae-conv-direct", "", true, ¶ms.vae_conv_direct}, - {"", "--canny", "", true, ¶ms.canny_preprocess}, - {"-v", "--verbose", "", true, ¶ms.verbose}, - {"", "--color", "", true, ¶ms.color}, - {"", "--chroma-disable-dit-mask", "", false, ¶ms.chroma_use_dit_mask}, - {"", "--chroma-enable-t5-mask", "", true, ¶ms.chroma_use_t5_mask}, - {"", "--increase-ref-index", "", true, ¶ms.increase_ref_index}, + {"", + "--vae-tiling", + "process vae in tiles to reduce memory usage", + true, ¶ms.vae_tiling_params.enabled}, + {"", + "--force-sdxl-vae-conv-scale", + "force use of conv scale on sdxl vae", + true, ¶ms.force_sdxl_vae_conv_scale}, + {"", + "--offload-to-cpu", + "place the weights in RAM to save VRAM, and automatically load them into VRAM when needed", + true, ¶ms.offload_params_to_cpu}, + {"", + "--control-net-cpu", + "keep controlnet in cpu (for low vram)", + true, ¶ms.control_net_cpu}, + {"", + "--clip-on-cpu", + "keep clip in cpu (for low vram)", + true, ¶ms.clip_on_cpu}, + {"", + "--vae-on-cpu", + "keep vae in cpu (for low vram)", + true, ¶ms.vae_on_cpu}, + {"", + "--diffusion-fa", + "use flash attention in the diffusion model", + true, ¶ms.diffusion_flash_attn}, + {"", + "--diffusion-conv-direct", + "use ggml_conv2d_direct in the diffusion model", + true, ¶ms.diffusion_conv_direct}, + {"", + "--vae-conv-direct", + "use ggml_conv2d_direct in the vae model", + true, ¶ms.vae_conv_direct}, + {"", + "--canny", + "apply canny preprocessor (edge detection)", + true, ¶ms.canny_preprocess}, + {"-v", + "--verbose", + "print extra info", + true, ¶ms.verbose}, + {"", + "--color", + "colors the logging tags according to level", + true, ¶ms.color}, + {"", + "--chroma-disable-dit-mask", + "disable dit mask for chroma", + false, ¶ms.chroma_use_dit_mask}, + {"", + "--chroma-enable-t5-mask", + "enable t5 mask for chroma", + true, ¶ms.chroma_use_t5_mask}, + {"", + "--increase-ref-index", + "automatically increase the indices of references images based on the order they are listed (starting with 1).", + true, ¶ms.increase_ref_index}, + {"", + "--disable-auto-resize-ref-image", + "disable auto resize of ref images", + false, ¶ms.auto_resize_ref_image}, }; auto on_mode_arg = [&](int argc, const char** argv, int index) { @@ -586,7 +814,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { return -1; } const char* mode = argv[index]; - if (mode != NULL) { + if (mode != nullptr) { int mode_found = -1; for (int i = 0; i < MODE_COUNT; i++) { if (!strcmp(mode, modes_str[i])) { @@ -711,7 +939,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { }; auto 
on_help_arg = [&](int argc, const char** argv, int index) { - print_usage(argc, argv); + print_usage(argc, argv, options); exit(0); return 0; }; @@ -825,25 +1053,73 @@ void parse_args(int argc, const char** argv, SDParams& params) { }; options.manual_options = { - {"-M", "--mode", "", on_mode_arg}, - {"", "--type", "", on_type_arg}, - {"", "--rng", "", on_rng_arg}, - {"-s", "--seed", "", on_seed_arg}, - {"", "--sampling-method", "", on_sample_method_arg}, - {"", "--prediction", "", on_prediction_arg}, - {"", "--scheduler", "", on_schedule_arg}, - {"", "--skip-layers", "", on_skip_layers_arg}, - {"", "--high-noise-sampling-method", "", on_high_noise_sample_method_arg}, - {"", "--high-noise-scheduler", "", on_high_noise_schedule_arg}, - {"", "--high-noise-skip-layers", "", on_high_noise_skip_layers_arg}, - {"-r", "--ref-image", "", on_ref_image_arg}, - {"-h", "--help", "", on_help_arg}, - {"", "--vae-tile-size", "", on_tile_size_arg}, - {"", "--vae-relative-tile-size", "", on_relative_tile_size_arg}, + {"-M", + "--mode", + "run mode, one of [img_gen, vid_gen, upscale, convert], default: img_gen", + on_mode_arg}, + {"", + "--type", + "weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). " + "If not specified, the default is the type of the weight file", + on_type_arg}, + {"", + "--rng", + "RNG, one of [std_default, cuda], default: cuda", + on_rng_arg}, + {"-s", + "--seed", + "RNG seed (default: 42, use random seed for < 0)", + on_seed_arg}, + {"", + "--sampling-method", + "sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd] " + "(default: euler for Flux/SD3/Wan, euler_a otherwise)", + on_sample_method_arg}, + {"", + "--prediction", + "prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow]", + on_prediction_arg}, + {"", + "--scheduler", + "denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple], default: discrete", + on_schedule_arg}, + {"", + "--skip-layers", + "layers to skip for SLG steps (default: [7,8,9])", + on_skip_layers_arg}, + {"", + "--high-noise-sampling-method", + "(high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd]" + " default: euler for Flux/SD3/Wan, euler_a otherwise", + on_high_noise_sample_method_arg}, + {"", + "--high-noise-scheduler", + "(high noise) denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple], default: discrete", + on_high_noise_schedule_arg}, + {"", + "--high-noise-skip-layers", + "(high noise) layers to skip for SLG steps (default: [7,8,9])", + on_high_noise_skip_layers_arg}, + {"-r", + "--ref-image", + "reference image for Flux Kontext models (can be used multiple times)", + on_ref_image_arg}, + {"-h", + "--help", + "show this help message and exit", + on_help_arg}, + {"", + "--vae-tile-size", + "tile size for vae tiling, format [X]x[Y] (default: 32x32)", + on_tile_size_arg}, + {"", + "--vae-relative-tile-size", + "relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)", + on_relative_tile_size_arg}, }; if (!parse_options(argc, argv, options)) { - print_usage(argc, argv); + print_usage(argc, argv, options); exit(1); } @@ -853,19 +1129,19 @@ void parse_args(int argc, const char** argv, SDParams& params) { if ((params.mode == IMG_GEN || params.mode == VID_GEN) && 
params.prompt.length() == 0) { fprintf(stderr, "error: the following arguments are required: prompt\n"); - print_usage(argc, argv); + print_usage(argc, argv, options); exit(1); } if (params.mode != UPSCALE && params.model_path.length() == 0 && params.diffusion_model_path.length() == 0) { fprintf(stderr, "error: the following arguments are required: model_path/diffusion_model\n"); - print_usage(argc, argv); + print_usage(argc, argv, options); exit(1); } if (params.output_path.length() == 0) { fprintf(stderr, "error: the following arguments are required: output_path\n"); - print_usage(argc, argv); + print_usage(argc, argv, options); exit(1); } @@ -917,6 +1193,11 @@ void parse_args(int argc, const char** argv, SDParams& params) { exit(1); } + if (params.upscale_tile_size < 1) { + fprintf(stderr, "error: upscale tile size must be at least 1\n"); + exit(1); + } + if (params.mode == UPSCALE) { if (params.esrgan_path.length() == 0) { fprintf(stderr, "error: upscale mode needs an upscaler model (--upscale-model)\n"); @@ -929,7 +1210,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { } if (params.seed < 0) { - srand((int)time(NULL)); + srand((int)time(nullptr)); params.seed = rand(); } @@ -1044,9 +1325,9 @@ void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) { uint8_t* load_image(const char* image_path, int& width, int& height, int expected_width = 0, int expected_height = 0, int expected_channel = 3) { int c = 0; uint8_t* image_buffer = (uint8_t*)stbi_load(image_path, &width, &height, &c, expected_channel); - if (image_buffer == NULL) { + if (image_buffer == nullptr) { fprintf(stderr, "load image from '%s' failed\n", image_path); - return NULL; + return nullptr; } if (c < expected_channel) { fprintf(stderr, @@ -1056,17 +1337,17 @@ uint8_t* load_image(const char* image_path, int& width, int& height, int expecte c, image_path); free(image_buffer); - return NULL; + return nullptr; } if (width <= 0) { fprintf(stderr, "error: the width of image must be greater than 0, image_path = %s\n", image_path); free(image_buffer); - return NULL; + return nullptr; } if (height <= 0) { fprintf(stderr, "error: the height of image must be greater than 0, image_path = %s\n", image_path); free(image_buffer); - return NULL; + return nullptr; } // Resize input image ... 
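// A minimal usage sketch of the load_image contract, mirroring how main() calls it
// further below ("input.png" and the 512x512 size are made-up values): the caller
// passes the expected output size, checks for nullptr on failure, and later frees
// the returned buffer.
//
//   int w = 0, h = 0;
//   uint8_t* data = load_image("input.png", w, h, /*expected_width*/ 512, /*expected_height*/ 512);
//   if (data == nullptr) {
//       fprintf(stderr, "load image from 'input.png' failed\n");
//   } else {
//       sd_image_t img = {(uint32_t)512, (uint32_t)512, 3, data};
//       // ... use img ...
//       free(img.data);
//   }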
@@ -1088,10 +1369,10 @@ uint8_t* load_image(const char* image_path, int& width, int& height, int expecte if (crop_x != 0 || crop_y != 0) { printf("crop input image from %dx%d to %dx%d, image_path = %s\n", width, height, crop_w, crop_h, image_path); uint8_t* cropped_image_buffer = (uint8_t*)malloc(crop_w * crop_h * expected_channel); - if (cropped_image_buffer == NULL) { + if (cropped_image_buffer == nullptr) { fprintf(stderr, "error: allocate memory for crop\n"); free(image_buffer); - return NULL; + return nullptr; } for (int row = 0; row < crop_h; row++) { uint8_t* src = image_buffer + ((crop_y + row) * width + crop_x) * expected_channel; @@ -1110,10 +1391,10 @@ uint8_t* load_image(const char* image_path, int& width, int& height, int expecte int resized_width = expected_width; uint8_t* resized_image_buffer = (uint8_t*)malloc(resized_height * resized_width * expected_channel); - if (resized_image_buffer == NULL) { + if (resized_image_buffer == nullptr) { fprintf(stderr, "error: allocate memory for resize input image\n"); free(image_buffer); - return NULL; + return nullptr; } stbir_resize(image_buffer, width, height, 0, resized_image_buffer, resized_width, resized_height, 0, STBIR_TYPE_UINT8, @@ -1164,7 +1445,7 @@ bool load_images_from_dir(const std::string dir, int width = 0; int height = 0; uint8_t* image_buffer = load_image(path.c_str(), width, height, expected_width, expected_height); - if (image_buffer == NULL) { + if (image_buffer == nullptr) { fprintf(stderr, "load image from '%s' failed\n", path.c_str()); return false; } @@ -1216,10 +1497,10 @@ int main(int argc, const char* argv[]) { } bool vae_decode_only = true; - sd_image_t init_image = {(uint32_t)params.width, (uint32_t)params.height, 3, NULL}; - sd_image_t end_image = {(uint32_t)params.width, (uint32_t)params.height, 3, NULL}; - sd_image_t control_image = {(uint32_t)params.width, (uint32_t)params.height, 3, NULL}; - sd_image_t mask_image = {(uint32_t)params.width, (uint32_t)params.height, 1, NULL}; + sd_image_t init_image = {(uint32_t)params.width, (uint32_t)params.height, 3, nullptr}; + sd_image_t end_image = {(uint32_t)params.width, (uint32_t)params.height, 3, nullptr}; + sd_image_t control_image = {(uint32_t)params.width, (uint32_t)params.height, 3, nullptr}; + sd_image_t mask_image = {(uint32_t)params.width, (uint32_t)params.height, 1, nullptr}; std::vector ref_images; std::vector pmid_images; std::vector control_frames; @@ -1231,17 +1512,17 @@ int main(int argc, const char* argv[]) { free(mask_image.data); for (auto image : ref_images) { free(image.data); - image.data = NULL; + image.data = nullptr; } ref_images.clear(); for (auto image : pmid_images) { free(image.data); - image.data = NULL; + image.data = nullptr; } pmid_images.clear(); for (auto image : control_frames) { free(image.data); - image.data = NULL; + image.data = nullptr; } control_frames.clear(); }; @@ -1252,7 +1533,7 @@ int main(int argc, const char* argv[]) { int width = 0; int height = 0; init_image.data = load_image(params.init_image_path.c_str(), width, height, params.width, params.height); - if (init_image.data == NULL) { + if (init_image.data == nullptr) { fprintf(stderr, "load image from '%s' failed\n", params.init_image_path.c_str()); release_all_resources(); return 1; @@ -1265,7 +1546,7 @@ int main(int argc, const char* argv[]) { int width = 0; int height = 0; end_image.data = load_image(params.end_image_path.c_str(), width, height, params.width, params.height); - if (end_image.data == NULL) { + if (end_image.data == nullptr) { fprintf(stderr, 
"load image from '%s' failed\n", params.end_image_path.c_str()); release_all_resources(); return 1; @@ -1277,7 +1558,7 @@ int main(int argc, const char* argv[]) { int width = 0; int height = 0; mask_image.data = load_image(params.mask_image_path.c_str(), width, height, params.width, params.height, 1); - if (mask_image.data == NULL) { + if (mask_image.data == nullptr) { fprintf(stderr, "load image from '%s' failed\n", params.mask_image_path.c_str()); release_all_resources(); return 1; @@ -1285,7 +1566,7 @@ int main(int argc, const char* argv[]) { } else { mask_image.data = (uint8_t*)malloc(params.width * params.height); memset(mask_image.data, 255, params.width * params.height); - if (mask_image.data == NULL) { + if (mask_image.data == nullptr) { fprintf(stderr, "malloc mask image failed\n"); release_all_resources(); return 1; @@ -1296,7 +1577,7 @@ int main(int argc, const char* argv[]) { int width = 0; int height = 0; control_image.data = load_image(params.control_image_path.c_str(), width, height, params.width, params.height); - if (control_image.data == NULL) { + if (control_image.data == nullptr) { fprintf(stderr, "load image from '%s' failed\n", params.control_image_path.c_str()); release_all_resources(); return 1; @@ -1317,7 +1598,7 @@ int main(int argc, const char* argv[]) { int width = 0; int height = 0; uint8_t* image_buffer = load_image(path.c_str(), width, height); - if (image_buffer == NULL) { + if (image_buffer == nullptr) { fprintf(stderr, "load image from '%s' failed\n", path.c_str()); release_all_resources(); return 1; @@ -1399,18 +1680,18 @@ int main(int argc, const char* argv[]) { if (params.mode == UPSCALE) { num_results = 1; results = (sd_image_t*)calloc(num_results, sizeof(sd_image_t)); - if (results == NULL) { + if (results == nullptr) { printf("failed to allocate results array\n"); release_all_resources(); return 1; } results[0] = init_image; - init_image.data = NULL; + init_image.data = nullptr; } else { sd_ctx_t* sd_ctx = new_sd_ctx(&sd_ctx_params); - if (sd_ctx == NULL) { + if (sd_ctx == nullptr) { printf("new_sd_ctx_t failed\n"); release_all_resources(); return 1; @@ -1428,6 +1709,7 @@ int main(int argc, const char* argv[]) { init_image, ref_images.data(), (int)ref_images.size(), + params.auto_resize_ref_image, params.increase_ref_index, mask_image, params.width, @@ -1472,7 +1754,7 @@ int main(int argc, const char* argv[]) { results = generate_video(sd_ctx, &vid_gen_params, &num_results); } - if (results == NULL) { + if (results == nullptr) { printf("generate failed\n"); free_sd_ctx(sd_ctx); return 1; @@ -1486,19 +1768,20 @@ int main(int argc, const char* argv[]) { upscaler_ctx_t* upscaler_ctx = new_upscaler_ctx(params.esrgan_path.c_str(), params.offload_params_to_cpu, params.diffusion_conv_direct, - params.n_threads); + params.n_threads, + params.upscale_tile_size); - if (upscaler_ctx == NULL) { + if (upscaler_ctx == nullptr) { printf("new_upscaler_ctx failed\n"); } else { for (int i = 0; i < num_results; i++) { - if (results[i].data == NULL) { + if (results[i].data == nullptr) { continue; } sd_image_t current_image = results[i]; for (int u = 0; u < params.upscale_repeats; ++u) { sd_image_t upscaled_image = upscale(upscaler_ctx, current_image, upscale_factor); - if (upscaled_image.data == NULL) { + if (upscaled_image.data == nullptr) { printf("upscale failed\n"); break; } @@ -1556,7 +1839,7 @@ int main(int argc, const char* argv[]) { file_ext = ".png"; } for (int i = 0; i < num_results; i++) { - if (results[i].data == NULL) { + if (results[i].data == nullptr) { 
continue; } std::string final_image_path = i > 0 ? base_path + "_" + std::to_string(i + 1) + file_ext : base_path + file_ext; @@ -1574,7 +1857,7 @@ int main(int argc, const char* argv[]) { for (int i = 0; i < num_results; i++) { free(results[i].data); - results[i].data = NULL; + results[i].data = nullptr; } free(results); diff --git a/flux.hpp b/flux.hpp index 2ed41041..355184be 100644 --- a/flux.hpp +++ b/flux.hpp @@ -1,6 +1,7 @@ #ifndef __FLUX_HPP__ #define __FLUX_HPP__ +#include #include #include "ggml_extend.hpp" @@ -18,7 +19,7 @@ namespace Flux { blocks["out_layer"] = std::shared_ptr(new Linear(hidden_dim, hidden_dim, true)); } - struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { + struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override { // x: [..., in_dim] // return: [..., hidden_dim] auto in_layer = std::dynamic_pointer_cast(blocks["in_layer"]); @@ -36,7 +37,7 @@ namespace Flux { int64_t hidden_size; float eps; - void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") { + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") override { ggml_type wtype = GGML_TYPE_F32; params["scale"] = ggml_new_tensor_1d(ctx, wtype, hidden_size); } @@ -47,7 +48,7 @@ namespace Flux { : hidden_size(hidden_size), eps(eps) {} - struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { + struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override { struct ggml_tensor* w = params["scale"]; x = ggml_rms_norm(ctx, x, eps); x = ggml_mul(ctx, x, w); @@ -136,11 +137,11 @@ namespace Flux { }; struct ModulationOut { - ggml_tensor* shift = NULL; - ggml_tensor* scale = NULL; - ggml_tensor* gate = NULL; + ggml_tensor* shift = nullptr; + ggml_tensor* scale = nullptr; + ggml_tensor* gate = nullptr; - ModulationOut(ggml_tensor* shift = NULL, ggml_tensor* scale = NULL, ggml_tensor* gate = NULL) + ModulationOut(ggml_tensor* shift = nullptr, ggml_tensor* scale = nullptr, ggml_tensor* gate = nullptr) : shift(shift), scale(scale), gate(gate) {} ModulationOut(struct ggml_context* ctx, ggml_tensor* vec, int64_t offset) { @@ -259,7 +260,7 @@ namespace Flux { struct ggml_tensor* txt, struct ggml_tensor* vec, struct ggml_tensor* pe, - struct ggml_tensor* mask = NULL) { + struct ggml_tensor* mask = nullptr) { // img: [N, n_img_token, hidden_size] // txt: [N, n_txt_token, hidden_size] // pe: [n_img_token + n_txt_token, d_head/2, 2, 2] @@ -398,7 +399,7 @@ namespace Flux { ModulationOut get_distil_mod(struct ggml_context* ctx, struct ggml_tensor* vec) { int64_t offset = 3 * idx; - return ModulationOut(ctx, vec, offset); + return {ctx, vec, offset}; } struct ggml_tensor* forward(struct ggml_context* ctx, @@ -406,7 +407,7 @@ namespace Flux { struct ggml_tensor* x, struct ggml_tensor* vec, struct ggml_tensor* pe, - struct ggml_tensor* mask = NULL) { + struct ggml_tensor* mask = nullptr) { // x: [N, n_token, hidden_size] // pe: [n_token, d_head/2, 2, 2] // return: [N, n_token, hidden_size] @@ -485,7 +486,7 @@ namespace Flux { auto shift = ggml_view_2d(ctx, vec, vec->ne[0], vec->ne[1], vec->nb[1], stride * (offset + 0)); // [N, dim] auto scale = ggml_view_2d(ctx, vec, vec->ne[0], vec->ne[1], vec->nb[1], stride * (offset + 1)); // [N, dim] // No gate - return ModulationOut(shift, scale, NULL); + return {shift, scale, nullptr}; } struct ggml_tensor* forward(struct ggml_context* ctx, @@ -664,7 +665,7 @@ namespace 
Flux { struct ggml_tensor* y, struct ggml_tensor* guidance, struct ggml_tensor* pe, - struct ggml_tensor* mod_index_arange = NULL, + struct ggml_tensor* mod_index_arange = nullptr, std::vector skip_layers = {}) { auto img_in = std::dynamic_pointer_cast(blocks["img_in"]); auto txt_in = std::dynamic_pointer_cast(blocks["txt_in"]); @@ -672,7 +673,7 @@ namespace Flux { img = img_in->forward(ctx, img); struct ggml_tensor* vec; - struct ggml_tensor* txt_img_mask = NULL; + struct ggml_tensor* txt_img_mask = nullptr; if (params.is_chroma) { int64_t mod_index_length = 344; auto approx = std::dynamic_pointer_cast(blocks["distilled_guidance_layer"]); @@ -681,7 +682,7 @@ namespace Flux { // auto mod_index_arange = ggml_arange(ctx, 0, (float)mod_index_length, 1); // ggml_arange tot working on a lot of backends, precomputing it on CPU instead - GGML_ASSERT(arange != NULL); + GGML_ASSERT(arange != nullptr); auto modulation_index = ggml_nn_timestep_embedding(ctx, mod_index_arange, 32, 10000, 1000.f); // [1, 344, 32] // Batch broadcast (will it ever be useful) @@ -695,7 +696,7 @@ namespace Flux { vec = ggml_cont(ctx, ggml_permute(ctx, vec, 0, 2, 1, 3)); // [344, N, 64] vec = approx->forward(ctx, vec); // [344, N, hidden_size] - if (y != NULL) { + if (y != nullptr) { txt_img_mask = ggml_pad(ctx, y, img->ne[1], 0, 0, 0); } } else { @@ -703,7 +704,7 @@ namespace Flux { auto vector_in = std::dynamic_pointer_cast(blocks["vector_in"]); vec = time_in->forward(ctx, ggml_nn_timestep_embedding(ctx, timesteps, 256, 10000, 1000.f)); if (params.guidance_embed) { - GGML_ASSERT(guidance != NULL); + GGML_ASSERT(guidance != nullptr); auto guidance_in = std::dynamic_pointer_cast(blocks["guidance_in"]); // bf16 and fp16 result is different auto g_in = ggml_nn_timestep_embedding(ctx, guidance, 256, 10000, 1000.f); @@ -775,14 +776,14 @@ namespace Flux { struct ggml_tensor* y, struct ggml_tensor* guidance, struct ggml_tensor* pe, - struct ggml_tensor* mod_index_arange = NULL, + struct ggml_tensor* mod_index_arange = nullptr, std::vector ref_latents = {}, std::vector skip_layers = {}) { // Forward pass of DiT. 
// x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images) // timestep: (N,) tensor of diffusion timesteps // context: (N, L, D) - // c_concat: NULL, or for (N,C+M, H, W) for Fill + // c_concat: nullptr, or for (N,C+M, H, W) for Fill // y: (N, adm_in_channels) tensor of class labels // guidance: (N,) // pe: (L, d_head/2, 2, 2) @@ -801,7 +802,7 @@ namespace Flux { uint64_t img_tokens = img->ne[1]; if (params.version == VERSION_FLUX_FILL) { - GGML_ASSERT(c_concat != NULL); + GGML_ASSERT(c_concat != nullptr); ggml_tensor* masked = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], 0); ggml_tensor* mask = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 8 * 8, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C); @@ -810,7 +811,7 @@ namespace Flux { img = ggml_concat(ctx, img, ggml_concat(ctx, masked, mask, 0), 0); } else if (params.version == VERSION_FLEX_2) { - GGML_ASSERT(c_concat != NULL); + GGML_ASSERT(c_concat != nullptr); ggml_tensor* masked = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], 0); ggml_tensor* mask = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 1, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C); ggml_tensor* control = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * (C + 1)); @@ -825,7 +826,7 @@ namespace Flux { img = ggml_concat(ctx, img, ggml_concat(ctx, ggml_concat(ctx, masked, mask, 0), control, 0), 0); } else if (params.version == VERSION_FLUX_CONTROLS) { - GGML_ASSERT(c_concat != NULL); + GGML_ASSERT(c_concat != nullptr); ggml_tensor* control = ggml_pad(ctx, c_concat, pad_w, pad_h, 0, 0); control = patchify(ctx, control, patch_size); @@ -924,7 +925,7 @@ namespace Flux { flux.init(params_ctx, tensor_types, prefix); } - std::string get_desc() { + std::string get_desc() override { return "flux"; } @@ -944,18 +945,18 @@ namespace Flux { GGML_ASSERT(x->ne[3] == 1); struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, FLUX_GRAPH_SIZE, false); - struct ggml_tensor* mod_index_arange = NULL; + struct ggml_tensor* mod_index_arange = nullptr; x = to_backend(x); context = to_backend(context); - if (c_concat != NULL) { + if (c_concat != nullptr) { c_concat = to_backend(c_concat); } if (flux_params.is_chroma) { guidance = ggml_set_f32(guidance, 0); if (!use_mask) { - y = NULL; + y = nullptr; } // ggml_arange is not working on some backends, precompute it @@ -987,7 +988,7 @@ namespace Flux { auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, flux_params.axes_dim_sum / 2, pos_len); // pe->data = pe_vec.data(); // print_ggml_tensor(pe); - // pe->data = NULL; + // pe->data = nullptr; set_backend_tensor_data(pe, pe_vec.data()); struct ggml_tensor* out = flux.forward(compute_ctx, @@ -1017,8 +1018,8 @@ namespace Flux { struct ggml_tensor* guidance, std::vector ref_latents = {}, bool increase_ref_index = false, - struct ggml_tensor** output = NULL, - struct ggml_context* output_ctx = NULL, + struct ggml_tensor** output = nullptr, + struct ggml_context* output_ctx = nullptr, std::vector skip_layers = std::vector()) { // x: [N, in_channels, h, w] // timesteps: [N, ] @@ -1035,11 +1036,11 @@ namespace Flux { void test() { struct ggml_init_params params; params.mem_size = static_cast(20 * 1024 * 1024); // 20 MB - params.mem_buffer = NULL; + 
params.mem_buffer = nullptr; params.no_alloc = false; struct ggml_context* work_ctx = ggml_init(params); - GGML_ASSERT(work_ctx != NULL); + GGML_ASSERT(work_ctx != nullptr); { // cpu f16: @@ -1063,10 +1064,10 @@ namespace Flux { ggml_set_f32(y, 0.01f); // print_ggml_tensor(y); - struct ggml_tensor* out = NULL; + struct ggml_tensor* out = nullptr; int t0 = ggml_time_ms(); - compute(8, x, timesteps, context, NULL, y, guidance, {}, false, &out, work_ctx); + compute(8, x, timesteps, context, nullptr, y, guidance, {}, false, &out, work_ctx); int t1 = ggml_time_ms(); print_ggml_tensor(out); @@ -1078,7 +1079,7 @@ namespace Flux { // ggml_backend_t backend = ggml_backend_cuda_init(0); ggml_backend_t backend = ggml_backend_cpu_init(); ggml_type model_data_type = GGML_TYPE_Q8_0; - std::shared_ptr<FluxRunner> flux = std::shared_ptr<FluxRunner>(new FluxRunner(backend, false)); + std::shared_ptr<FluxRunner> flux = std::make_shared<FluxRunner>(backend, false); { LOG_INFO("loading from '%s'", file_path.c_str()); diff --git a/format-code.sh b/format-code.sh index 9fdba32e..adad801f 100644 --- a/format-code.sh +++ b/format-code.sh @@ -1,5 +1,8 @@ for f in *.cpp *.h *.hpp examples/cli/*.cpp examples/cli/*.h; do [[ "$f" == vocab* ]] && continue echo "formatting '$f'" + # if [ "$f" != "stable-diffusion.h" ]; then + # clang-tidy -fix -p build_linux/ "$f" + # fi clang-format -style=file -i "$f" done \ No newline at end of file diff --git a/ggml b/ggml index 7bffd79a..c538174d 160000 --- a/ggml +++ b/ggml @@ -1 +1 @@ -Subproject commit 7bffd79a4bec72e9a3bfbedb582a218b84401c13 +Subproject commit c538174d261d8172480f87efcfec8e69aac13ebb diff --git a/ggml_extend.hpp b/ggml_extend.hpp index d8df0d8f..02d82bc0 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -105,7 +105,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_mul_n_mode(struct ggml_context* ctx, return result; } -__STATIC_INLINE__ struct ggml_tensor* ggml_merge_lora(ggml_context* ctx, struct ggml_tensor* lora_down, struct ggml_tensor* lora_up, struct ggml_tensor* lora_mid = NULL) { +__STATIC_INLINE__ struct ggml_tensor* ggml_merge_lora(ggml_context* ctx, struct ggml_tensor* lora_down, struct ggml_tensor* lora_up, struct ggml_tensor* lora_mid = nullptr) { struct ggml_tensor* updown; // flat lora tensors to multiply it int64_t lora_up_rows = lora_up->ne[ggml_n_dims(lora_up) - 1]; @@ -118,7 +118,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_merge_lora(ggml_context* ctx, struct // ggml_mul_mat requires tensor b transposed lora_down = ggml_cont(ctx, ggml_transpose(ctx, lora_down)); - if (lora_mid == NULL) { + if (lora_mid == nullptr) { updown = ggml_mul_mat(ctx, lora_up, lora_down); updown = ggml_cont(ctx, ggml_transpose(ctx, updown)); } else { @@ -165,7 +165,7 @@ __STATIC_INLINE__ void ggml_tensor_set_f32(struct ggml_tensor* tensor, float val } __STATIC_INLINE__ float ggml_tensor_get_f32(const ggml_tensor* tensor, int l, int k = 0, int j = 0, int i = 0) { - if (tensor->buffer != NULL) { + if (tensor->buffer != nullptr) { float value; ggml_backend_tensor_get(tensor, &value, i * tensor->nb[3] + j * tensor->nb[2] + k * tensor->nb[1] + l * tensor->nb[0], sizeof(float)); return value; @@ -175,7 +175,7 @@ __STATIC_INLINE__ int ggml_tensor_get_i32(const ggml_tensor* tensor, int l, in - if (tensor->buffer != NULL) { + if (tensor->buffer != nullptr) { float value; ggml_backend_tensor_get(tensor, &value, i * tensor->nb[3] + j * tensor->nb[2] + k * tensor->nb[1] + l * tensor->nb[0], sizeof(int)); 
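// note: `value` is declared as float here, yet only sizeof(int) bytes are read into it
// and the float is converted to int on return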
return value; @@ -292,7 +292,7 @@ __STATIC_INLINE__ ggml_tensor* load_tensor_from_file(ggml_context* ctx, const st std::ifstream file(file_path, std::ios::binary); if (!file.is_open()) { LOG_ERROR("failed to open '%s'", file_path.c_str()); - return NULL; + return nullptr; } int32_t n_dims; int32_t length; @@ -306,7 +306,7 @@ __STATIC_INLINE__ ggml_tensor* load_tensor_from_file(ggml_context* ctx, const st if (file.eof()) { LOG_ERROR("incomplete file '%s'", file_path.c_str()); - return NULL; + return nullptr; } int32_t nelements = 1; @@ -354,7 +354,7 @@ __STATIC_INLINE__ void copy_ggml_tensor(struct ggml_tensor* dst, struct ggml_ten } struct ggml_init_params params; params.mem_size = 10 * 1024 * 1024; // for padding - params.mem_buffer = NULL; + params.mem_buffer = nullptr; params.no_alloc = false; struct ggml_context* ctx = ggml_init(params); if (!ctx) { @@ -860,7 +860,7 @@ __STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input, params.mem_size += input_tile_size_x * input_tile_size_y * input->ne[2] * input->ne[3] * sizeof(float); // input chunk params.mem_size += output_tile_size_x * output_tile_size_y * output->ne[2] * output->ne[3] * sizeof(float); // output chunk params.mem_size += 3 * ggml_tensor_overhead(); - params.mem_buffer = NULL; + params.mem_buffer = nullptr; params.no_alloc = false; LOG_DEBUG("tile work buffer size: %.2f MB", params.mem_size / 1024.f / 1024.f); @@ -961,7 +961,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_linear(struct ggml_context* ctx, if (scale != 1.f) { x = ggml_scale(ctx, x, 1.f / scale); } - if (b != NULL) { + if (b != nullptr) { x = ggml_add_inplace(ctx, x, b); } return x; @@ -994,7 +994,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d(struct ggml_context* ctx, if (scale != 1.f) { x = ggml_scale(ctx, x, 1.f / scale); } - if (b != NULL) { + if (b != nullptr) { b = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1); x = ggml_add_inplace(ctx, x, b); } @@ -1023,7 +1023,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_3d(struct ggml_context* ctx, int64_t N = x->ne[3] / IC; x = ggml_conv_3d(ctx, w, x, IC, s0, s1, s2, p0, p1, p2, d0, d1, d2); - if (b != NULL) { + if (b != nullptr) { b = ggml_reshape_4d(ctx, b, 1, 1, 1, b->ne[0]); // [OC, 1, 1, 1] x = ggml_add_inplace(ctx, x, b); } @@ -1042,7 +1042,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_3d_nx1x1(struct ggml_context* int p2 = 1, int d2 = 1) { x = ggml_conv_2d(ctx, w, x, 1, s2, 0, p2, 1, d2); // [N, OC, T, OH * OW] - if (b != NULL) { + if (b != nullptr) { b = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1); x = ggml_add(ctx, x, b); } @@ -1146,7 +1146,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention_ext(struct ggml_context* struct ggml_tensor* k, struct ggml_tensor* v, int64_t n_head, - struct ggml_tensor* mask = NULL, + struct ggml_tensor* mask = nullptr, bool diag_mask_inf = false, bool skip_reshape = false, bool flash_attn = false, // avoid overflow @@ -1293,9 +1293,9 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_layer_norm(struct ggml_context* ct struct ggml_tensor* b, float eps = EPS) { x = ggml_norm(ctx, x, eps); - if (w != NULL) { + if (w != nullptr) { x = ggml_mul_inplace(ctx, x, w); - if (b != NULL) { + if (b != nullptr) { x = ggml_add_inplace(ctx, x, b); } } @@ -1307,14 +1307,14 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_group_norm(struct ggml_context* ct struct ggml_tensor* w, struct ggml_tensor* b, int num_groups = 32) { - if (ggml_n_dims(x) >= 3 && w != NULL && b != NULL) { + if (ggml_n_dims(x) >= 3 && w != nullptr && b != nullptr) { w = 
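The ggml_nn_layer_norm / ggml_nn_group_norm hunks above encode the contract "nullptr weight or bias means no affine transform". A scalar sketch of that contract on plain arrays, illustrative only and not the ggml kernels:

#include <cmath>
#include <cstddef>
#include <vector>

// Normalize x to zero mean / unit variance, then apply the optional affine
// parameters. w == nullptr / b == nullptr skips that part, mirroring the
// nullptr checks in ggml_nn_layer_norm above.
std::vector<float> layer_norm_sketch(const std::vector<float>& x,
                                     const float* w, const float* b, float eps = 1e-5f) {
    float mean = 0.f;
    for (float v : x) mean += v;
    mean /= (float)x.size();
    float var = 0.f;
    for (float v : x) var += (v - mean) * (v - mean);
    var /= (float)x.size();
    std::vector<float> y(x.size());
    for (size_t i = 0; i < x.size(); i++) {
        y[i] = (x[i] - mean) / std::sqrt(var + eps);
        if (w != nullptr) {
            y[i] *= w[i];
            if (b != nullptr) y[i] += b[i];
        }
    }
    return y;
}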
ggml_reshape_4d(ctx, w, 1, 1, w->ne[0], 1); b = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1); } const float eps = 1e-6f; // default eps parameter x = ggml_group_norm(ctx, x, num_groups, eps); - if (w != NULL && b != NULL) { + if (w != nullptr && b != nullptr) { x = ggml_mul_inplace(ctx, x, w); // b = ggml_repeat(ctx, b, x); x = ggml_add_inplace(ctx, x, b); @@ -1422,7 +1422,7 @@ __STATIC_INLINE__ struct ggml_tensor* new_timestep_embedding(struct ggml_context // embedding: [N, dim] std::vector embedding_vec = timestep_embedding(timesteps, dim, max_period); struct ggml_tensor* embedding = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, dim, timesteps.size()); - if (embedding->data != NULL) { + if (embedding->data != nullptr) { memcpy(((char*)embedding->data), ((char*)embedding_vec.data()), ggml_nbytes(embedding)); } else { ggml_backend_tensor_set(embedding, embedding_vec.data(), 0, ggml_nbytes(embedding)); @@ -1458,23 +1458,23 @@ struct GGMLRunner { protected: typedef std::function get_graph_cb_t; - ggml_backend_t params_backend = NULL; - ggml_backend_t runtime_backend = NULL; + ggml_backend_t params_backend = nullptr; + ggml_backend_t runtime_backend = nullptr; - struct ggml_context* params_ctx = NULL; - ggml_backend_buffer_t params_buffer = NULL; - struct ggml_context* offload_ctx = NULL; - ggml_backend_buffer_t runtime_params_buffer = NULL; + struct ggml_context* params_ctx = nullptr; + ggml_backend_buffer_t params_buffer = nullptr; + struct ggml_context* offload_ctx = nullptr; + ggml_backend_buffer_t runtime_params_buffer = nullptr; bool params_on_runtime_backend = false; - struct ggml_context* cache_ctx = NULL; - ggml_backend_buffer_t cache_buffer = NULL; + struct ggml_context* cache_ctx = nullptr; + ggml_backend_buffer_t cache_buffer = nullptr; - struct ggml_context* compute_ctx = NULL; - struct ggml_gallocr* compute_allocr = NULL; + struct ggml_context* compute_ctx = nullptr; + struct ggml_gallocr* compute_allocr = nullptr; std::vector one_vec = {1.f}; - ggml_tensor* one_tensor = NULL; + ggml_tensor* one_tensor = nullptr; std::map backend_tensor_data_map; std::map cache_tensor_map; // name -> tensor @@ -1483,59 +1483,59 @@ struct GGMLRunner { void alloc_params_ctx() { struct ggml_init_params params; params.mem_size = static_cast(MAX_PARAMS_TENSOR_NUM * ggml_tensor_overhead()); - params.mem_buffer = NULL; + params.mem_buffer = nullptr; params.no_alloc = true; params_ctx = ggml_init(params); - GGML_ASSERT(params_ctx != NULL); + GGML_ASSERT(params_ctx != nullptr); if (params_backend != runtime_backend) { offload_ctx = ggml_init(params); - GGML_ASSERT(offload_ctx != NULL); + GGML_ASSERT(offload_ctx != nullptr); } } void free_params_ctx() { - if (params_ctx != NULL) { + if (params_ctx != nullptr) { ggml_free(params_ctx); - params_ctx = NULL; + params_ctx = nullptr; } - if (offload_ctx != NULL) { + if (offload_ctx != nullptr) { ggml_free(offload_ctx); - offload_ctx = NULL; + offload_ctx = nullptr; } } void alloc_cache_ctx() { struct ggml_init_params params; params.mem_size = static_cast(MAX_PARAMS_TENSOR_NUM * ggml_tensor_overhead()); - params.mem_buffer = NULL; + params.mem_buffer = nullptr; params.no_alloc = true; cache_ctx = ggml_init(params); - GGML_ASSERT(cache_ctx != NULL); + GGML_ASSERT(cache_ctx != nullptr); } void free_cache_ctx() { - if (cache_ctx != NULL) { + if (cache_ctx != nullptr) { ggml_free(cache_ctx); - cache_ctx = NULL; + cache_ctx = nullptr; } } void alloc_compute_ctx() { struct ggml_init_params params; params.mem_size = static_cast(ggml_tensor_overhead() * MAX_GRAPH_SIZE + 
ggml_graph_overhead()); - params.mem_buffer = NULL; + params.mem_buffer = nullptr; params.no_alloc = true; compute_ctx = ggml_init(params); - GGML_ASSERT(compute_ctx != NULL); + GGML_ASSERT(compute_ctx != nullptr); } void free_compute_ctx() { - if (compute_ctx != NULL) { + if (compute_ctx != nullptr) { ggml_free(compute_ctx); - compute_ctx = NULL; + compute_ctx = nullptr; } } @@ -1559,7 +1559,7 @@ struct GGMLRunner { } bool alloc_compute_buffer(get_graph_cb_t get_graph) { - if (compute_allocr != NULL) { + if (compute_allocr != nullptr) { return true; } reset_compute_ctx(); @@ -1584,9 +1584,9 @@ struct GGMLRunner { } void free_cache_buffer() { - if (cache_buffer != NULL) { + if (cache_buffer != nullptr) { ggml_backend_buffer_free(cache_buffer); - cache_buffer = NULL; + cache_buffer = nullptr; } } @@ -1596,7 +1596,7 @@ struct GGMLRunner { } free_cache_ctx_and_buffer(); alloc_cache_ctx(); - GGML_ASSERT(cache_buffer == NULL); + GGML_ASSERT(cache_buffer == nullptr); std::map runtime_tensor_to_cache_tensor; for (auto kv : cache_tensor_map) { auto cache_tensor = ggml_dup_tensor(cache_ctx, kv.second); @@ -1605,7 +1605,7 @@ struct GGMLRunner { } size_t num_tensors = ggml_tensor_num(cache_ctx); cache_buffer = ggml_backend_alloc_ctx_tensors(cache_ctx, runtime_backend); - GGML_ASSERT(cache_buffer != NULL); + GGML_ASSERT(cache_buffer != nullptr); for (auto kv : runtime_tensor_to_cache_tensor) { ggml_backend_tensor_copy(kv.first, kv.second); } @@ -1637,12 +1637,12 @@ struct GGMLRunner { if (params_on_runtime_backend) { return true; } - GGML_ASSERT(runtime_params_buffer == NULL); + GGML_ASSERT(runtime_params_buffer == nullptr); int64_t t0 = ggml_time_ms(); size_t num_tensors = ggml_tensor_num(offload_ctx); if (num_tensors == 0) { - for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != NULL; t = ggml_get_next_tensor(params_ctx, t)) { - GGML_ASSERT(t->view_src == NULL); + for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != nullptr; t = ggml_get_next_tensor(params_ctx, t)) { + GGML_ASSERT(t->view_src == nullptr); ggml_dup_tensor(offload_ctx, t); } } @@ -1651,7 +1651,7 @@ struct GGMLRunner { runtime_params_buffer = ggml_backend_alloc_ctx_tensors(offload_ctx, runtime_backend); - if (runtime_params_buffer == NULL) { + if (runtime_params_buffer == nullptr) { LOG_ERROR("%s alloc runtime params backend buffer failed, num_tensors = %i", get_desc().c_str(), num_tensors); @@ -1661,7 +1661,7 @@ struct GGMLRunner { ggml_tensor* t = ggml_get_first_tensor(params_ctx); ggml_tensor* offload_t = ggml_get_first_tensor(offload_ctx); - while (t != NULL && offload_t != NULL) { + while (t != nullptr && offload_t != nullptr) { ggml_backend_tensor_copy(t, offload_t); std::swap(t->buffer, offload_t->buffer); std::swap(t->data, offload_t->data); @@ -1693,21 +1693,21 @@ struct GGMLRunner { ggml_tensor* t = ggml_get_first_tensor(params_ctx); ggml_tensor* offload_t = ggml_get_first_tensor(offload_ctx); - while (t != NULL && offload_t != NULL) { + while (t != nullptr && offload_t != nullptr) { t->buffer = offload_t->buffer; t->data = offload_t->data; t->extra = offload_t->extra; - offload_t->buffer = NULL; - offload_t->data = NULL; - offload_t->extra = NULL; + offload_t->buffer = nullptr; + offload_t->data = nullptr; + offload_t->extra = nullptr; t = ggml_get_next_tensor(params_ctx, t); offload_t = ggml_get_next_tensor(offload_ctx, offload_t); } - if (runtime_params_buffer != NULL) { + if (runtime_params_buffer != nullptr) { ggml_backend_buffer_free(runtime_params_buffer); - runtime_params_buffer = NULL; + 
runtime_params_buffer = nullptr; } params_on_runtime_backend = false; } @@ -1744,7 +1744,7 @@ struct GGMLRunner { bool alloc_params_buffer() { size_t num_tensors = ggml_tensor_num(params_ctx); params_buffer = ggml_backend_alloc_ctx_tensors(params_ctx, params_backend); - if (params_buffer == NULL) { + if (params_buffer == nullptr) { LOG_ERROR("%s alloc params backend buffer failed, num_tensors = %i", get_desc().c_str(), num_tensors); @@ -1760,14 +1760,14 @@ struct GGMLRunner { } void free_params_buffer() { - if (params_buffer != NULL) { + if (params_buffer != nullptr) { ggml_backend_buffer_free(params_buffer); - params_buffer = NULL; + params_buffer = nullptr; } } size_t get_params_buffer_size() { - if (params_buffer != NULL) { + if (params_buffer != nullptr) { return ggml_backend_buffer_get_size(params_buffer); } return 0; @@ -1779,9 +1779,9 @@ struct GGMLRunner { } void free_compute_buffer() { - if (compute_allocr != NULL) { + if (compute_allocr != nullptr) { ggml_gallocr_free(compute_allocr); - compute_allocr = NULL; + compute_allocr = nullptr; } offload_params_to_params_backend(); } @@ -1792,12 +1792,12 @@ struct GGMLRunner { } struct ggml_tensor* to_backend(struct ggml_tensor* tensor) { - GGML_ASSERT(compute_ctx != NULL); - if (tensor == NULL) { - return NULL; + GGML_ASSERT(compute_ctx != nullptr); + if (tensor == nullptr) { + return nullptr; } // it's performing a compute, check if backend isn't cpu - if (!ggml_backend_is_cpu(runtime_backend) && (tensor->buffer == NULL || ggml_backend_buffer_is_host(tensor->buffer))) { + if (!ggml_backend_is_cpu(runtime_backend) && (tensor->buffer == nullptr || ggml_backend_buffer_is_host(tensor->buffer))) { // pass input tensors to gpu memory auto backend_tensor = ggml_dup_tensor(compute_ctx, tensor); @@ -1813,8 +1813,8 @@ struct GGMLRunner { } struct ggml_tensor* get_cache_tensor_by_name(const std::string& name) { - if (cache_ctx == NULL) { - return NULL; + if (cache_ctx == nullptr) { + return nullptr; } return ggml_get_tensor(cache_ctx, name.c_str()); } @@ -1822,8 +1822,8 @@ struct GGMLRunner { void compute(get_graph_cb_t get_graph, int n_threads, bool free_compute_buffer_immediately = true, - struct ggml_tensor** output = NULL, - struct ggml_context* output_ctx = NULL) { + struct ggml_tensor** output = nullptr, + struct ggml_context* output_ctx = nullptr) { if (!offload_params_to_runtime_backend()) { LOG_ERROR("%s offload params to runtime backend failed", get_desc().c_str()); return; @@ -1842,12 +1842,12 @@ struct GGMLRunner { ggml_graph_print(gf); #endif copy_cache_tensors_to_cache_buffer(); - if (output != NULL) { + if (output != nullptr) { auto result = ggml_get_tensor(compute_ctx, final_result_name.c_str()); - if (*output == NULL && output_ctx != NULL) { + if (*output == nullptr && output_ctx != nullptr) { *output = ggml_dup_tensor(output_ctx, result); } - if (*output != NULL) { + if (*output != nullptr) { ggml_backend_tensor_get_and_sync(runtime_backend, result, (*output)->data, 0, ggml_nbytes(*output)); } } @@ -1994,7 +1994,7 @@ class Linear : public UnaryBlock { struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { struct ggml_tensor* w = params["weight"]; - struct ggml_tensor* b = NULL; + struct ggml_tensor* b = nullptr; if (bias) { b = params["bias"]; } @@ -2098,7 +2098,7 @@ class Conv2d : public UnaryBlock { struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { struct ggml_tensor* w = params["weight"]; - struct ggml_tensor* b = NULL; + struct ggml_tensor* b = nullptr; if (bias) { b = 
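The to_backend / compute pair above is the pattern every runner touched by this diff follows: a build_graph method captures the inputs, and a lambda hands the finished graph to GGMLRunner::compute. A schematic subclass, where MyRunner and the ggml ops are purely illustrative and the GGMLRunner API is assumed from the signatures visible in this hunk:

// Sketch only: MyRunner is hypothetical; compute_ctx, to_backend, compute and
// get_desc are the GGMLRunner members shown above.
struct MyRunner : public GGMLRunner {
    using GGMLRunner::GGMLRunner;

    std::string get_desc() override { return "my_runner"; }

    struct ggml_cgraph* build_graph(struct ggml_tensor* x) {
        struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
        x = to_backend(x);                                  // copy input to the runtime backend if needed
        struct ggml_tensor* y = ggml_relu(compute_ctx, x);  // stand-in for the real model graph
        ggml_build_forward_expand(gf, y);
        return gf;
    }

    void compute(int n_threads, struct ggml_tensor* x,
                 struct ggml_tensor** output, struct ggml_context* output_ctx = nullptr) {
        auto get_graph = [&]() -> struct ggml_cgraph* { return build_graph(x); };
        GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
    }
};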
params["bias"]; } @@ -2156,7 +2156,7 @@ class Conv3dnx1x1 : public UnaryBlock { // result: [N, OC, OD, OH*OW] struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { struct ggml_tensor* w = params["weight"]; - struct ggml_tensor* b = NULL; + struct ggml_tensor* b = nullptr; if (bias) { b = params["bias"]; } @@ -2205,7 +2205,7 @@ class Conv3d : public UnaryBlock { struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { struct ggml_tensor* w = params["weight"]; - struct ggml_tensor* b = NULL; + struct ggml_tensor* b = nullptr; if (bias) { b = params["bias"]; } @@ -2245,8 +2245,8 @@ class LayerNorm : public UnaryBlock { bias(bias) {} struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { - struct ggml_tensor* w = NULL; - struct ggml_tensor* b = NULL; + struct ggml_tensor* w = nullptr; + struct ggml_tensor* b = nullptr; if (elementwise_affine) { w = params["weight"]; @@ -2285,8 +2285,8 @@ class GroupNorm : public GGMLBlock { affine(affine) {} struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { - struct ggml_tensor* w = NULL; - struct ggml_tensor* b = NULL; + struct ggml_tensor* w = nullptr; + struct ggml_tensor* b = nullptr; if (affine) { w = params["weight"]; b = params["bias"]; @@ -2369,7 +2369,7 @@ class MultiheadAttention : public GGMLBlock { struct ggml_tensor* k = k_proj->forward(ctx, x); struct ggml_tensor* v = v_proj->forward(ctx, x); - x = ggml_nn_attention_ext(ctx, backend, q, k, v, n_head, NULL, mask); // [N, n_token, embed_dim] + x = ggml_nn_attention_ext(ctx, backend, q, k, v, n_head, nullptr, mask); // [N, n_token, embed_dim] x = out_proj->forward(ctx, x); // [N, n_token, embed_dim] return x; diff --git a/lora.hpp b/lora.hpp index 1fce9569..0d403d58 100644 --- a/lora.hpp +++ b/lora.hpp @@ -100,7 +100,7 @@ struct LoraModel : public GGMLRunner { bool load_failed = false; bool applied = false; std::vector zero_index_vec = {0}; - ggml_tensor* zero_index = NULL; + ggml_tensor* zero_index = nullptr; enum lora_t type = REGULAR; LoraModel(ggml_backend_t backend, @@ -112,7 +112,7 @@ struct LoraModel : public GGMLRunner { } } - std::string get_desc() { + std::string get_desc() override { return "lora"; } @@ -287,7 +287,7 @@ struct LoraModel : public GGMLRunner { if (is_qkvm_split) { key = key.substr(sizeof("SPLIT_L|") - 1); } - struct ggml_tensor* updown = NULL; + struct ggml_tensor* updown = nullptr; float scale_value = 1.0f; std::string full_key = lora_pre[type] + key; if (is_bias) { @@ -314,13 +314,13 @@ struct LoraModel : public GGMLRunner { } std::string alpha_name = ""; - ggml_tensor* hada_1_mid = NULL; // tau for tucker decomposition - ggml_tensor* hada_1_up = NULL; - ggml_tensor* hada_1_down = NULL; + ggml_tensor* hada_1_mid = nullptr; // tau for tucker decomposition + ggml_tensor* hada_1_up = nullptr; + ggml_tensor* hada_1_down = nullptr; - ggml_tensor* hada_2_mid = NULL; // tau for tucker decomposition - ggml_tensor* hada_2_up = NULL; - ggml_tensor* hada_2_down = NULL; + ggml_tensor* hada_2_mid = nullptr; // tau for tucker decomposition + ggml_tensor* hada_2_up = nullptr; + ggml_tensor* hada_2_down = nullptr; std::string hada_1_mid_name = ""; std::string hada_1_down_name = ""; @@ -368,7 +368,7 @@ struct LoraModel : public GGMLRunner { applied_lora_tensors.insert(hada_2_up_name); applied_lora_tensors.insert(alpha_name); - if (hada_1_up == NULL || hada_1_down == NULL || hada_2_up == NULL || hada_2_down == NULL) { + if (hada_1_up == nullptr || hada_1_down == nullptr || hada_2_up == 
nullptr || hada_2_down == nullptr) { continue; } @@ -394,8 +394,8 @@ struct LoraModel : public GGMLRunner { std::string alpha_name = full_key + ".alpha"; - ggml_tensor* lokr_w1 = NULL; - ggml_tensor* lokr_w2 = NULL; + ggml_tensor* lokr_w1 = nullptr; + ggml_tensor* lokr_w2 = nullptr; std::string lokr_w1_name = ""; std::string lokr_w2_name = ""; @@ -407,8 +407,8 @@ struct LoraModel : public GGMLRunner { lokr_w1 = to_f32(compute_ctx, lora_tensors[lokr_w1_name]); applied_lora_tensors.insert(lokr_w1_name); } else { - ggml_tensor* down = NULL; - ggml_tensor* up = NULL; + ggml_tensor* down = nullptr; + ggml_tensor* up = nullptr; std::string down_name = lokr_w1_name + "_b"; std::string up_name = lokr_w1_name + "_a"; if (lora_tensors.find(down_name) != lora_tensors.end()) { @@ -432,8 +432,8 @@ struct LoraModel : public GGMLRunner { lokr_w2 = to_f32(compute_ctx, lora_tensors[lokr_w2_name]); applied_lora_tensors.insert(lokr_w2_name); } else { - ggml_tensor* down = NULL; - ggml_tensor* up = NULL; + ggml_tensor* down = nullptr; + ggml_tensor* up = nullptr; std::string down_name = lokr_w2_name + "_b"; std::string up_name = lokr_w2_name + "_a"; if (lora_tensors.find(down_name) != lora_tensors.end()) { @@ -460,9 +460,9 @@ struct LoraModel : public GGMLRunner { } else { // LoRA mode - ggml_tensor* lora_mid = NULL; // tau for tucker decomposition - ggml_tensor* lora_up = NULL; - ggml_tensor* lora_down = NULL; + ggml_tensor* lora_mid = nullptr; // tau for tucker decomposition + ggml_tensor* lora_up = nullptr; + ggml_tensor* lora_down = nullptr; std::string alpha_name = ""; std::string scale_name = ""; @@ -497,12 +497,12 @@ struct LoraModel : public GGMLRunner { auto split_k_alpha_name = full_key + "k" + suffix + ".alpha"; auto split_v_alpha_name = full_key + "v" + suffix + ".alpha"; - ggml_tensor* lora_q_down = NULL; - ggml_tensor* lora_q_up = NULL; - ggml_tensor* lora_k_down = NULL; - ggml_tensor* lora_k_up = NULL; - ggml_tensor* lora_v_down = NULL; - ggml_tensor* lora_v_up = NULL; + ggml_tensor* lora_q_down = nullptr; + ggml_tensor* lora_q_up = nullptr; + ggml_tensor* lora_k_down = nullptr; + ggml_tensor* lora_k_up = nullptr; + ggml_tensor* lora_v_down = nullptr; + ggml_tensor* lora_v_up = nullptr; lora_q_down = to_f32(compute_ctx, lora_tensors[split_q_d_name]); @@ -633,15 +633,15 @@ struct LoraModel : public GGMLRunner { auto split_v_alpha_name = full_key + "attn.to_v" + ".alpha"; auto split_m_alpha_name = full_key + "proj_mlp" + ".alpha"; - ggml_tensor* lora_q_down = NULL; - ggml_tensor* lora_q_up = NULL; - ggml_tensor* lora_k_down = NULL; - ggml_tensor* lora_k_up = NULL; - ggml_tensor* lora_v_down = NULL; - ggml_tensor* lora_v_up = NULL; + ggml_tensor* lora_q_down = nullptr; + ggml_tensor* lora_q_up = nullptr; + ggml_tensor* lora_k_down = nullptr; + ggml_tensor* lora_k_up = nullptr; + ggml_tensor* lora_v_down = nullptr; + ggml_tensor* lora_v_up = nullptr; - ggml_tensor* lora_m_down = NULL; - ggml_tensor* lora_m_up = NULL; + ggml_tensor* lora_m_down = nullptr; + ggml_tensor* lora_m_up = nullptr; lora_q_up = to_f32(compute_ctx, lora_tensors[split_q_u_name]); @@ -809,7 +809,7 @@ struct LoraModel : public GGMLRunner { } } - if (lora_up == NULL || lora_down == NULL) { + if (lora_up == nullptr || lora_down == nullptr) { continue; } // calc_scale diff --git a/ltxv.hpp b/ltxv.hpp index 6ff66811..fdd190f0 100644 --- a/ltxv.hpp +++ b/ltxv.hpp @@ -13,10 +13,10 @@ namespace LTXV { public: CausalConv3d(int64_t in_channels, int64_t out_channels, - int kernel_size = 3, - std::tuple stride = {1, 1, 1}, - int dilation = 
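For reference, the three low-rank families handled in this block differ only in how the weight delta is assembled (standard LyCORIS conventions; the exact alpha/scale handling is whatever this implementation applies):

\Delta W_{\mathrm{LoRA}} = \frac{\alpha}{r}\,\mathrm{up}\cdot\mathrm{down}
\qquad
\Delta W_{\mathrm{LoHa}} = \frac{\alpha}{r}\,(\mathrm{up}_1\,\mathrm{down}_1)\odot(\mathrm{up}_2\,\mathrm{down}_2)
\qquad
\Delta W_{\mathrm{LoKr}} = \frac{\alpha}{r}\,(W_1\otimes W_2)

The optional mid tensors are Tucker cores, and LoKr's W_1 / W_2 may themselves be stored as the `_a`/`_b` low-rank factors looked up above.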
1, - bool bias = true) { + int kernel_size = 3, + std::tuple stride = {1, 1, 1}, + int dilation = 1, + bool bias = true) { time_kernel_size = kernel_size / 2; blocks["conv"] = std::shared_ptr(new Conv3d(in_channels, out_channels, diff --git a/mmdit.hpp b/mmdit.hpp index d9d19340..8442592a 100644 --- a/mmdit.hpp +++ b/mmdit.hpp @@ -1,6 +1,8 @@ #ifndef __MMDIT_HPP__ #define __MMDIT_HPP__ +#include + #include "ggml_extend.hpp" #include "model.h" @@ -208,8 +210,8 @@ class SelfAttention : public GGMLBlock { ggml_backend_t backend, struct ggml_tensor* x) { auto qkv = pre_attention(ctx, x); - x = ggml_nn_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], num_heads, NULL, false, false, true); // [N, n_token, dim] - x = post_attention(ctx, x); // [N, n_token, dim] + x = ggml_nn_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], num_heads, nullptr, false, false, true); // [N, n_token, dim] + x = post_attention(ctx, x); // [N, n_token, dim] return x; } }; @@ -347,7 +349,7 @@ struct DismantledBlock : public GGMLBlock { auto attn_in = modulate(ctx, norm1->forward(ctx, x), shift_msa, scale_msa); auto qkv = attn->pre_attention(ctx, attn_in); - return {qkv, {NULL, NULL, NULL, NULL, NULL}}; + return {qkv, {nullptr, nullptr, nullptr, nullptr, nullptr}}; } } @@ -439,8 +441,8 @@ struct DismantledBlock : public GGMLBlock { auto qkv2 = std::get<1>(qkv_intermediates); auto intermediates = std::get<2>(qkv_intermediates); - auto attn_out = ggml_nn_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], num_heads, NULL, false, false, flash_attn); // [N, n_token, dim] - auto attn2_out = ggml_nn_attention_ext(ctx, backend, qkv2[0], qkv2[1], qkv2[2], num_heads, NULL, false, false, flash_attn); // [N, n_token, dim] + auto attn_out = ggml_nn_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], num_heads, nullptr, false, false, flash_attn); // [N, n_token, dim] + auto attn2_out = ggml_nn_attention_ext(ctx, backend, qkv2[0], qkv2[1], qkv2[2], num_heads, nullptr, false, false, flash_attn); // [N, n_token, dim] x = post_attention_x(ctx, attn_out, attn2_out, @@ -456,7 +458,7 @@ struct DismantledBlock : public GGMLBlock { auto qkv = qkv_intermediates.first; auto intermediates = qkv_intermediates.second; - auto attn_out = ggml_nn_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], num_heads, NULL, false, false, flash_attn); // [N, n_token, dim] + auto attn_out = ggml_nn_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], num_heads, nullptr, false, false, flash_attn); // [N, n_token, dim] x = post_attention(ctx, attn_out, intermediates[0], @@ -502,8 +504,8 @@ block_mixing(struct ggml_context* ctx, qkv.push_back(ggml_concat(ctx, context_qkv[i], x_qkv[i], 1)); } - auto attn = ggml_nn_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], x_block->num_heads, NULL, false, false, flash_attn); // [N, n_context + n_token, hidden_size] - attn = ggml_cont(ctx, ggml_permute(ctx, attn, 0, 2, 1, 3)); // [n_context + n_token, N, hidden_size] + auto attn = ggml_nn_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], x_block->num_heads, nullptr, false, false, flash_attn); // [N, n_context + n_token, hidden_size] + attn = ggml_cont(ctx, ggml_permute(ctx, attn, 0, 2, 1, 3)); // [n_context + n_token, N, hidden_size] auto context_attn = ggml_view_3d(ctx, attn, attn->ne[0], @@ -532,7 +534,7 @@ block_mixing(struct ggml_context* ctx, context_intermediates[3], context_intermediates[4]); } else { - context = NULL; + context = nullptr; } if (x_block->self_attn) { @@ -645,7 +647,7 @@ struct MMDiT : public GGMLBlock { std::string qk_norm; bool flash_attn = 
false; - void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") { + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") override { enum ggml_type wtype = GGML_TYPE_F32; params["pos_embed"] = ggml_new_tensor_3d(ctx, wtype, hidden_size, num_patchs, 1); } @@ -823,8 +825,8 @@ struct MMDiT : public GGMLBlock { ggml_backend_t backend, struct ggml_tensor* x, struct ggml_tensor* t, - struct ggml_tensor* y = NULL, - struct ggml_tensor* context = NULL, + struct ggml_tensor* y = nullptr, + struct ggml_tensor* context = nullptr, std::vector skip_layers = std::vector()) { // Forward pass of DiT. // x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images) @@ -843,14 +845,14 @@ struct MMDiT : public GGMLBlock { x = ggml_add(ctx, patch_embed, pos_embed); // [N, H*W, hidden_size] auto c = t_embedder->forward(ctx, t); // [N, hidden_size] - if (y != NULL && adm_in_channels != -1) { + if (y != nullptr && adm_in_channels != -1) { auto y_embedder = std::dynamic_pointer_cast(blocks["y_embedder"]); y = y_embedder->forward(ctx, y); // [N, hidden_size] c = ggml_add(ctx, c, y); } - if (context != NULL) { + if (context != nullptr) { auto context_embedder = std::dynamic_pointer_cast(blocks["context_embedder"]); context = context_embedder->forward(ctx, context); // [N, L, D] aka [N, L, 1536] @@ -875,7 +877,7 @@ struct MMDiTRunner : public GGMLRunner { mmdit.init(params_ctx, tensor_types, prefix); } - std::string get_desc() { + std::string get_desc() override { return "mmdit"; } @@ -913,8 +915,8 @@ struct MMDiTRunner : public GGMLRunner { struct ggml_tensor* timesteps, struct ggml_tensor* context, struct ggml_tensor* y, - struct ggml_tensor** output = NULL, - struct ggml_context* output_ctx = NULL, + struct ggml_tensor** output = nullptr, + struct ggml_context* output_ctx = nullptr, std::vector skip_layers = std::vector()) { // x: [N, in_channels, h, w] // timesteps: [N, ] @@ -930,11 +932,11 @@ struct MMDiTRunner : public GGMLRunner { void test() { struct ggml_init_params params; params.mem_size = static_cast(10 * 1024 * 1024); // 10 MB - params.mem_buffer = NULL; + params.mem_buffer = nullptr; params.no_alloc = false; struct ggml_context* work_ctx = ggml_init(params); - GGML_ASSERT(work_ctx != NULL); + GGML_ASSERT(work_ctx != nullptr); { // cpu f16: pass @@ -955,7 +957,7 @@ struct MMDiTRunner : public GGMLRunner { ggml_set_f32(y, 0.01f); // print_ggml_tensor(y); - struct ggml_tensor* out = NULL; + struct ggml_tensor* out = nullptr; int t0 = ggml_time_ms(); compute(8, x, timesteps, context, y, &out, work_ctx); @@ -970,7 +972,7 @@ struct MMDiTRunner : public GGMLRunner { // ggml_backend_t backend = ggml_backend_cuda_init(0); ggml_backend_t backend = ggml_backend_cpu_init(); ggml_type model_data_type = GGML_TYPE_F16; - std::shared_ptr mmdit = std::shared_ptr(new MMDiTRunner(backend, false, false)); + std::shared_ptr mmdit = std::make_shared(backend, false, false); { LOG_INFO("loading from '%s'", file_path.c_str()); diff --git a/model.cpp b/model.cpp index b45493cc..b877915c 100644 --- a/model.cpp +++ b/model.cpp @@ -1,7 +1,7 @@ -#include #include #include #include +#include #include #include #include @@ -869,7 +869,6 @@ uint16_t f8_e5m2_to_f16(uint8_t fp8) { } if (exponent == 0) { // subnormal numbers - fp16_exponent = 0; fp16_mantissa = (mantissa << 8); return fp16_sign | fp16_mantissa; } @@ -948,7 +947,7 @@ void convert_tensor(void* src, ggml_fp16_to_fp32_row((ggml_fp16_t*)src, 
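On the f8_e5m2_to_f16 hunk in model.cpp: the deleted `fp16_exponent = 0;` was a dead store, since the subnormal branch only ORs the sign and mantissa into the result. More broadly, e5m2 shares fp16's 5-bit exponent and bias, so widening amounts to padding the mantissa; a self-contained sketch (my own illustration, not the repo's function):

#include <cstdint>

// fp8 e5m2:  S EEEEE MM          (bias 15)
// fp16:      S EEEEE MMMMMMMMMM  (bias 15)
// Same sign/exponent layout and bias, so widening just pads the mantissa with
// zeros, i.e. a left shift by 8; normals, subnormals, inf and NaN all map.
static inline uint16_t f8_e5m2_to_f16_sketch(uint8_t fp8) {
    return (uint16_t)((uint16_t)fp8 << 8);
}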
(float*)dst, n); } else { auto qtype = ggml_get_type_traits(src_type); - if (qtype->to_float == NULL) { + if (qtype->to_float == nullptr) { throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(src_type))); } @@ -958,7 +957,7 @@ void convert_tensor(void* src, // src_type == GGML_TYPE_F16 => dst_type is quantized // src_type is quantized => dst_type == GGML_TYPE_F16 or dst_type is quantized auto qtype = ggml_get_type_traits(src_type); - if (qtype->to_float == NULL) { + if (qtype->to_float == nullptr) { throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(src_type))); } @@ -1020,7 +1019,7 @@ std::map unicode_to_byte() { bool is_zip_file(const std::string& file_path) { struct zip_t* zip = zip_open(file_path.c_str(), 0, 'r'); - if (zip == NULL) { + if (zip == nullptr) { return false; } zip_close(zip); @@ -1116,8 +1115,8 @@ bool ModelLoader::init_from_gguf_file(const std::string& file_path, const std::s file_paths_.push_back(file_path); size_t file_index = file_paths_.size() - 1; - gguf_context* ctx_gguf_ = NULL; - ggml_context* ctx_meta_ = NULL; + gguf_context* ctx_gguf_ = nullptr; + ggml_context* ctx_meta_ = nullptr; ctx_gguf_ = gguf_init_from_file(file_path.c_str(), {true, &ctx_meta_}); if (!ctx_gguf_) { @@ -1726,7 +1725,7 @@ bool ModelLoader::init_from_ckpt_file(const std::string& file_path, const std::s size_t file_index = file_paths_.size() - 1; struct zip_t* zip = zip_open(file_path.c_str(), 0, 'r'); - if (zip == NULL) { + if (zip == nullptr) { LOG_ERROR("failed to open '%s'", file_path.c_str()); return false; } @@ -1739,7 +1738,7 @@ bool ModelLoader::init_from_ckpt_file(const std::string& file_path, const std::s if (pos != std::string::npos) { std::string dir = name.substr(0, pos); printf("ZIP %d, name = %s, dir = %s \n", i, name.c_str(), dir.c_str()); - void* pkl_data = NULL; + void* pkl_data = nullptr; size_t pkl_size; zip_entry_read(zip, &pkl_data, &pkl_size); @@ -1892,24 +1891,25 @@ SDVersion ModelLoader::get_sd_version() { return VERSION_COUNT; } -ggml_type ModelLoader::get_sd_wtype() { +std::map ModelLoader::get_wtype_stat() { + std::map wtype_stat; for (auto& tensor_storage : tensor_storages) { if (is_unused_tensor(tensor_storage.name)) { continue; } - if (ggml_is_quantized(tensor_storage.type)) { - return tensor_storage.type; - } - - if (tensor_should_be_converted(tensor_storage, GGML_TYPE_Q4_K)) { - return tensor_storage.type; + auto iter = wtype_stat.find(tensor_storage.type); + if (iter != wtype_stat.end()) { + iter->second++; + } else { + wtype_stat[tensor_storage.type] = 1; } } - return GGML_TYPE_COUNT; + return wtype_stat; } -ggml_type ModelLoader::get_conditioner_wtype() { +std::map ModelLoader::get_conditioner_wtype_stat() { + std::map wtype_stat; for (auto& tensor_storage : tensor_storages) { if (is_unused_tensor(tensor_storage.name)) { continue; @@ -1922,18 +1922,18 @@ ggml_type ModelLoader::get_conditioner_wtype() { continue; } - if (ggml_is_quantized(tensor_storage.type)) { - return tensor_storage.type; - } - - if (tensor_should_be_converted(tensor_storage, GGML_TYPE_Q4_K)) { - return tensor_storage.type; + auto iter = wtype_stat.find(tensor_storage.type); + if (iter != wtype_stat.end()) { + iter->second++; + } else { + wtype_stat[tensor_storage.type] = 1; } } - return GGML_TYPE_COUNT; + return wtype_stat; } -ggml_type ModelLoader::get_diffusion_model_wtype() { +std::map ModelLoader::get_diffusion_model_wtype_stat() { + std::map 
wtype_stat; for (auto& tensor_storage : tensor_storages) { if (is_unused_tensor(tensor_storage.name)) { continue; @@ -1943,18 +1943,18 @@ ggml_type ModelLoader::get_diffusion_model_wtype() { continue; } - if (ggml_is_quantized(tensor_storage.type)) { - return tensor_storage.type; - } - - if (tensor_should_be_converted(tensor_storage, GGML_TYPE_Q4_K)) { - return tensor_storage.type; + auto iter = wtype_stat.find(tensor_storage.type); + if (iter != wtype_stat.end()) { + iter->second++; + } else { + wtype_stat[tensor_storage.type] = 1; } } - return GGML_TYPE_COUNT; + return wtype_stat; } -ggml_type ModelLoader::get_vae_wtype() { +std::map ModelLoader::get_vae_wtype_stat() { + std::map wtype_stat; for (auto& tensor_storage : tensor_storages) { if (is_unused_tensor(tensor_storage.name)) { continue; @@ -1965,15 +1965,14 @@ ggml_type ModelLoader::get_vae_wtype() { continue; } - if (ggml_is_quantized(tensor_storage.type)) { - return tensor_storage.type; - } - - if (tensor_should_be_converted(tensor_storage, GGML_TYPE_Q4_K)) { - return tensor_storage.type; + auto iter = wtype_stat.find(tensor_storage.type); + if (iter != wtype_stat.end()) { + iter->second++; + } else { + wtype_stat[tensor_storage.type] = 1; } } - return GGML_TYPE_COUNT; + return wtype_stat; } void ModelLoader::set_wtype_override(ggml_type wtype, std::string prefix) { @@ -2144,10 +2143,10 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread for (int i = 0; i < n_threads; ++i) { workers.emplace_back([&, file_path, is_zip]() { std::ifstream file; - struct zip_t* zip = NULL; + struct zip_t* zip = nullptr; if (is_zip) { zip = zip_open(file_path.c_str(), 0, 'r'); - if (zip == NULL) { + if (zip == nullptr) { LOG_ERROR("failed to open zip '%s'", file_path.c_str()); failed = true; return; @@ -2172,7 +2171,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread } const TensorStorage& tensor_storage = *file_tensors[idx]; - ggml_tensor* dst_tensor = NULL; + ggml_tensor* dst_tensor = nullptr; t0 = ggml_time_ms(); @@ -2182,7 +2181,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread break; } - if (dst_tensor == NULL) { + if (dst_tensor == nullptr) { t1 = ggml_time_ms(); read_time_ms.fetch_add(t1 - t0); continue; @@ -2191,7 +2190,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread size_t nbytes_to_read = tensor_storage.nbytes_to_read(); auto read_data = [&](char* buf, size_t n) { - if (zip != NULL) { + if (zip != nullptr) { zip_entry_openbyindex(zip, tensor_storage.index_in_zip); size_t entry_size = zip_entry_size(zip); if (entry_size != n) { @@ -2215,7 +2214,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread } }; - if (dst_tensor->buffer == NULL || ggml_backend_buffer_is_host(dst_tensor->buffer)) { + if (dst_tensor->buffer == nullptr || ggml_backend_buffer_is_host(dst_tensor->buffer)) { if (tensor_storage.type == dst_tensor->type) { GGML_ASSERT(ggml_nbytes(dst_tensor) == tensor_storage.nbytes()); if (tensor_storage.is_f64 || tensor_storage.is_i64) { @@ -2317,7 +2316,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread } } } - if (zip != NULL) { + if (zip != nullptr) { zip_close(zip); } }); @@ -2507,7 +2506,7 @@ bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type mem_size += tensor_storages.size() * ggml_tensor_overhead(); mem_size += get_params_mem_size(backend, type); LOG_INFO("model tensors mem size: %.2fMB", mem_size / 
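The get_*_wtype() to get_*_wtype_stat() change replaces "first quantized type wins" with a full histogram. A caller that still wants one representative type can reduce the map, for instance by taking the most frequent entry; this sketch assumes the maps are keyed by ggml_type with a tensor count as the value, which is what the counting loops above imply:

#include <cstdint>
#include <map>
#include "ggml.h"

// Pick the most common weight type from a histogram shaped like the one
// returned by the get_*_wtype_stat() accessors (assumed std::map<ggml_type, uint32_t>).
static ggml_type dominant_wtype(const std::map<ggml_type, uint32_t>& stat,
                                ggml_type fallback = GGML_TYPE_F32) {
    ggml_type best = fallback;
    uint32_t count = 0;
    for (const auto& [type, n] : stat) {
        if (n > count) {
            best  = type;
            count = n;
        }
    }
    return best;
}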
1024.f / 1024.f); - ggml_context* ggml_ctx = ggml_init({mem_size, NULL, false}); + ggml_context* ggml_ctx = ggml_init({mem_size, nullptr, false}); gguf_context* gguf_ctx = gguf_init_empty(); @@ -2533,7 +2532,7 @@ bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type std::lock_guard lock(tensor_mutex); ggml_tensor* tensor = ggml_new_tensor(ggml_ctx, tensor_type, tensor_storage.n_dims, tensor_storage.ne); - if (tensor == NULL) { + if (tensor == nullptr) { LOG_ERROR("ggml_new_tensor failed"); return false; } @@ -2566,7 +2565,7 @@ bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type int64_t ModelLoader::get_params_mem_size(ggml_backend_t backend, ggml_type type) { size_t alignment = 128; - if (backend != NULL) { + if (backend != nullptr) { alignment = ggml_backend_get_alignment(backend); } int64_t mem_size = 0; @@ -2596,7 +2595,7 @@ bool convert(const char* input_path, const char* vae_path, const char* output_pa return false; } - if (vae_path != NULL && strlen(vae_path) > 0) { + if (vae_path != nullptr && strlen(vae_path) > 0) { if (!model_loader.init_from_file(vae_path, "vae.")) { LOG_ERROR("init model loader from file failed: '%s'", vae_path); return false; diff --git a/model.h b/model.h index 069bb0c2..fe77a219 100644 --- a/model.h +++ b/model.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include "ggml-backend.h" @@ -140,8 +141,8 @@ struct TensorStorage { TensorStorage() = default; - TensorStorage(const std::string& name, ggml_type type, const int64_t* ne, int n_dims, size_t file_index, size_t offset = 0) - : name(name), type(type), n_dims(n_dims), file_index(file_index), offset(offset) { + TensorStorage(std::string name, ggml_type type, const int64_t* ne, int n_dims, size_t file_index, size_t offset = 0) + : name(std::move(name)), type(type), n_dims(n_dims), file_index(file_index), offset(offset) { for (int i = 0; i < n_dims; i++) { this->ne[i] = ne[i]; } @@ -259,10 +260,10 @@ class ModelLoader { bool init_from_file(const std::string& file_path, const std::string& prefix = ""); bool model_is_unet(); SDVersion get_sd_version(); - ggml_type get_sd_wtype(); - ggml_type get_conditioner_wtype(); - ggml_type get_diffusion_model_wtype(); - ggml_type get_vae_wtype(); + std::map get_wtype_stat(); + std::map get_conditioner_wtype_stat(); + std::map get_diffusion_model_wtype_stat(); + std::map get_vae_wtype_stat(); void set_wtype_override(ggml_type wtype, std::string prefix = ""); bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads = 0); bool load_tensors(std::map& tensors, diff --git a/pmid.hpp b/pmid.hpp index 63029cbc..5ad7096a 100644 --- a/pmid.hpp +++ b/pmid.hpp @@ -472,8 +472,8 @@ struct PhotoMakerIDEncoder : public GGMLRunner { struct ggml_tensor* prompt_embeds_d = to_backend(prompt_embeds); struct ggml_tensor* id_embeds_d = to_backend(id_embeds); - struct ggml_tensor* left = NULL; - struct ggml_tensor* right = NULL; + struct ggml_tensor* left = nullptr; + struct ggml_tensor* right = nullptr; for (int i = 0; i < class_tokens_mask.size(); i++) { if (class_tokens_mask[i]) { // printf(" 1,"); @@ -528,7 +528,7 @@ struct PhotoMakerIDEncoder : public GGMLRunner { } } } - struct ggml_tensor* updated_prompt_embeds = NULL; + struct ggml_tensor* updated_prompt_embeds = nullptr; if (pm_version == PM_VERSION_1) updated_prompt_embeds = id_encoder.forward(ctx0, runtime_backend, @@ -638,7 +638,7 @@ struct PhotoMakerIDEmbed : public GGMLRunner { pos = tensors.find("pmid.id_embeds"); if (pos != tensors.end()) return 
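The TensorStorage constructor change is the modernize-pass-by-value pattern from the .clang-tidy check list: take the string by value and move it into the member, so temporaries are moved through and lvalue callers pay exactly one copy. Minimal illustration (the type and names here are mine):

#include <string>
#include <utility>

struct Named {
    std::string name;

    // By-value parameter + std::move: one copy for lvalue arguments,
    // moves only for rvalue/temporary arguments.
    explicit Named(std::string name) : name(std::move(name)) {}
};

// Usage:
//   std::string n = "model.diffusion_model.x.weight";
//   Named a(n);              // one copy into the parameter, then a move
//   Named b("vae.decoder");  // the temporary is moved all the way through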
pos->second; - return NULL; + return nullptr; } }; diff --git a/preprocessing.hpp b/preprocessing.hpp index 552aa642..11c3a21b 100644 --- a/preprocessing.hpp +++ b/preprocessing.hpp @@ -7,7 +7,7 @@ void convolve(struct ggml_tensor* input, struct ggml_tensor* output, struct ggml_tensor* kernel, int padding) { struct ggml_init_params params; params.mem_size = 80 * input->ne[0] * input->ne[1]; // 20M for 512x512 - params.mem_buffer = NULL; + params.mem_buffer = nullptr; params.no_alloc = false; struct ggml_context* ctx0 = ggml_init(params); struct ggml_tensor* kernel_fp16 = ggml_new_tensor_4d(ctx0, GGML_TYPE_F16, kernel->ne[0], kernel->ne[1], 1, 1); @@ -165,7 +165,7 @@ void threshold_hystersis(struct ggml_tensor* img, float high_threshold, float lo bool preprocess_canny(sd_image_t img, float high_threshold, float low_threshold, float weak, float strong, bool inverse) { struct ggml_init_params params; params.mem_size = static_cast(40 * img.width * img.height); // 10MB for 512x512 - params.mem_buffer = NULL; + params.mem_buffer = nullptr; params.no_alloc = false; struct ggml_context* work_ctx = ggml_init(params); diff --git a/qwen_image.hpp b/qwen_image.hpp index ce4e62dc..2d3cd230 100644 --- a/qwen_image.hpp +++ b/qwen_image.hpp @@ -1,6 +1,8 @@ #ifndef __QWEN_IMAGE_HPP__ #define __QWEN_IMAGE_HPP__ +#include + #include "common.hpp" #include "flux.hpp" #include "ggml_extend.hpp" @@ -534,12 +536,12 @@ namespace Qwen { continue; } } - LOG_ERROR("qwen_image_params.num_layers: %ld", qwen_image_params.num_layers); + LOG_INFO("qwen_image_params.num_layers: %ld", qwen_image_params.num_layers); qwen_image = QwenImageModel(qwen_image_params); qwen_image.init(params_ctx, tensor_types, prefix); } - std::string get_desc() { + std::string get_desc() override { return "qwen_image"; } @@ -577,7 +579,7 @@ namespace Qwen { auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, qwen_image_params.axes_dim_sum / 2, pos_len); // pe->data = pe_vec.data(); // print_ggml_tensor(pe, true, "pe"); - // pe->data = NULL; + // pe->data = nullptr; set_backend_tensor_data(pe, pe_vec.data()); struct ggml_tensor* out = qwen_image.forward(compute_ctx, @@ -599,8 +601,8 @@ namespace Qwen { struct ggml_tensor* context, std::vector ref_latents = {}, bool increase_ref_index = false, - struct ggml_tensor** output = NULL, - struct ggml_context* output_ctx = NULL) { + struct ggml_tensor** output = nullptr, + struct ggml_context* output_ctx = nullptr) { // x: [N, in_channels, h, w] // timesteps: [N, ] // context: [N, max_position, hidden_size] @@ -614,11 +616,11 @@ namespace Qwen { void test() { struct ggml_init_params params; params.mem_size = static_cast(1024 * 1024) * 1024; // 1GB - params.mem_buffer = NULL; + params.mem_buffer = nullptr; params.no_alloc = false; struct ggml_context* work_ctx = ggml_init(params); - GGML_ASSERT(work_ctx != NULL); + GGML_ASSERT(work_ctx != nullptr); { // auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 16, 16, 16, 1); @@ -634,7 +636,7 @@ namespace Qwen { auto context = load_tensor_from_file(work_ctx, "./qwen_image_context.bin"); print_ggml_tensor(context); - struct ggml_tensor* out = NULL; + struct ggml_tensor* out = nullptr; int t0 = ggml_time_ms(); compute(8, x, timesteps, context, {}, false, &out, work_ctx); @@ -666,12 +668,12 @@ namespace Qwen { } } - std::shared_ptr qwen_image = std::shared_ptr(new QwenImageRunner(backend, - false, - tensor_types, - "model.diffusion_model", - VERSION_QWEN_IMAGE, - true)); + std::shared_ptr qwen_image = std::make_shared(backend, + false, + tensor_types, + 
"model.diffusion_model", + VERSION_QWEN_IMAGE, + true); qwen_image->alloc_params_buffer(); std::map tensors; diff --git a/qwenvl.hpp b/qwenvl.hpp index 881f54d7..ab04435a 100644 --- a/qwenvl.hpp +++ b/qwenvl.hpp @@ -5,11 +5,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include "clip.hpp" @@ -589,7 +591,7 @@ namespace Qwen { int64_t window_size, std::set fullatt_block_indexes = {7, 15, 23, 31}, float eps = 1e-6f) - : num_layers(num_layers), fullatt_block_indexes(fullatt_block_indexes), spatial_merge_size(spatial_merge_size) { + : num_layers(num_layers), fullatt_block_indexes(std::move(fullatt_block_indexes)), spatial_merge_size(spatial_merge_size) { blocks["patch_embed"] = std::shared_ptr(new Qwen2_5_VisionPatchEmbed(llama_cpp_style, patch_size, temporal_patch_size, @@ -949,7 +951,7 @@ namespace Qwen { model.init(params_ctx, tensor_types, prefix); } - std::string get_desc() { + std::string get_desc() override { return "qwenvl2.5"; } @@ -1011,7 +1013,7 @@ namespace Qwen { struct ggml_tensor* input_ids, std::vector> image_embeds, ggml_tensor** output, - ggml_context* output_ctx = NULL) { + ggml_context* output_ctx = nullptr) { auto get_graph = [&]() -> struct ggml_cgraph* { return build_graph(input_ids, image_embeds); }; @@ -1162,7 +1164,7 @@ namespace Qwen { auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, head_dim / 2, pos_len); // pe->data = pe_vec.data(); // print_ggml_tensor(pe); - // pe->data = NULL; + // pe->data = nullptr; set_backend_tensor_data(pe, pe_vec.data()); struct ggml_tensor* hidden_states = vision_forward(compute_ctx, @@ -1180,7 +1182,7 @@ namespace Qwen { void encode_image(const int n_threads, struct ggml_tensor* image, ggml_tensor** output, - ggml_context* output_ctx = NULL) { + ggml_context* output_ctx = nullptr) { auto get_graph = [&]() -> struct ggml_cgraph* { return build_encode_image_graph(image); }; @@ -1246,11 +1248,11 @@ namespace Qwen { void test() { struct ggml_init_params params; params.mem_size = static_cast(1024 * 1024) * 1024; // 1GB - params.mem_buffer = NULL; + params.mem_buffer = nullptr; params.no_alloc = false; struct ggml_context* work_ctx = ggml_init(params); - GGML_ASSERT(work_ctx != NULL); + GGML_ASSERT(work_ctx != nullptr); bool test_vit = true; bool test_decoder_with_vit = true; @@ -1259,7 +1261,7 @@ namespace Qwen { { auto image = load_tensor_from_file(work_ctx, "qwen2vl_normalized.bin"); print_ggml_tensor(image, false, "image"); - struct ggml_tensor* out = NULL; + struct ggml_tensor* out = nullptr; int t0 = ggml_time_ms(); model.encode_image(8, image, &out, work_ctx); @@ -1295,7 +1297,7 @@ namespace Qwen { } printf("\n"); auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens); - struct ggml_tensor* out = NULL; + struct ggml_tensor* out = nullptr; int t0 = ggml_time_ms(); model.compute(8, input_ids, image_embeds, &out, work_ctx); @@ -1308,7 +1310,7 @@ namespace Qwen { // ggml_set_f32(image, 0.f); auto image = load_tensor_from_file(work_ctx, "qwen2vl_normalized.bin"); print_ggml_tensor(image, false, "image"); - struct ggml_tensor* out = NULL; + struct ggml_tensor* out = nullptr; int t0 = ggml_time_ms(); model.encode_image(8, image, &out, work_ctx); @@ -1330,7 +1332,7 @@ namespace Qwen { } printf("\n"); auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens); - struct ggml_tensor* out = NULL; + struct ggml_tensor* out = nullptr; int t0 = ggml_time_ms(); model.compute(8, input_ids, {}, &out, work_ctx); @@ -1361,11 +1363,11 @@ namespace Qwen { } } - std::shared_ptr 
qwenvl = std::shared_ptr(new Qwen2_5_VLEmbedder(backend, - false, - tensor_types, - "qwen2vl", - true)); + std::shared_ptr qwenvl = std::make_shared(backend, + false, + tensor_types, + "qwen2vl", + true); qwenvl->alloc_params_buffer(); std::map tensors; diff --git a/rng.hpp b/rng.hpp index 3340be61..accc4088 100644 --- a/rng.hpp +++ b/rng.hpp @@ -15,11 +15,11 @@ class STDDefaultRNG : public RNG { std::default_random_engine generator; public: - void manual_seed(uint64_t seed) { + void manual_seed(uint64_t seed) override { generator.seed((unsigned int)seed); } - std::vector randn(uint32_t n) { + std::vector randn(uint32_t n) override { std::vector result; float mean = 0.0; float stddev = 1.0; diff --git a/rng_philox.hpp b/rng_philox.hpp index 33fea9c5..58da0703 100644 --- a/rng_philox.hpp +++ b/rng_philox.hpp @@ -93,12 +93,12 @@ class PhiloxRNG : public RNG { this->offset = 0; } - void manual_seed(uint64_t seed) { + void manual_seed(uint64_t seed) override { this->seed = seed; this->offset = 0; } - std::vector randn(uint32_t n) { + std::vector randn(uint32_t n) override { std::vector> counter(4, std::vector(n, 0)); for (uint32_t i = 0; i < n; i++) { counter[0][i] = this->offset; diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 87b6a377..8fb88f48 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -82,14 +82,10 @@ void calculate_alphas_cumprod(float* alphas_cumprod, class StableDiffusionGGML { public: - ggml_backend_t backend = NULL; // general backend - ggml_backend_t clip_backend = NULL; - ggml_backend_t control_net_backend = NULL; - ggml_backend_t vae_backend = NULL; - ggml_type model_wtype = GGML_TYPE_COUNT; - ggml_type conditioner_wtype = GGML_TYPE_COUNT; - ggml_type diffusion_model_wtype = GGML_TYPE_COUNT; - ggml_type vae_wtype = GGML_TYPE_COUNT; + ggml_backend_t backend = nullptr; // general backend + ggml_backend_t clip_backend = nullptr; + ggml_backend_t control_net_backend = nullptr; + ggml_backend_t vae_backend = nullptr; SDVersion version; bool vae_decode_only = false; @@ -105,7 +101,7 @@ class StableDiffusionGGML { std::shared_ptr high_noise_diffusion_model; std::shared_ptr first_stage_model; std::shared_ptr tae_first_stage; - std::shared_ptr control_net = NULL; + std::shared_ptr control_net = nullptr; std::shared_ptr pmid_model; std::shared_ptr pmid_lora; std::shared_ptr pmid_id_embeds; @@ -294,37 +290,33 @@ class StableDiffusionGGML { ggml_type wtype = (int)sd_ctx_params->wtype < std::min(SD_TYPE_COUNT, GGML_TYPE_COUNT) ? 
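The rng.hpp / rng_philox.hpp hunks show what modernize-use-override buys: if a derived signature drifts from the base class, the compiler now rejects it instead of silently introducing a new, never-called virtual. Self-contained sketch of the failure mode (the RNG names are reused for flavor; the bodies are mine):

#include <cstdint>
#include <vector>

struct RNGSketch {
    virtual ~RNGSketch() = default;
    virtual void manual_seed(uint64_t seed) = 0;
    virtual std::vector<float> randn(uint32_t n) = 0;
};

struct DefaultRNGSketch : public RNGSketch {
    void manual_seed(uint64_t seed) override { (void)seed; }
    std::vector<float> randn(uint32_t n) override { return std::vector<float>(n, 0.f); }

    // Without `override`, a signature typo like the following would compile as a
    // brand-new overload and never be reached through a RNGSketch*:
    //   void manual_seed(uint32_t seed);
    // With `override`, the same mismatch is a hard compile error:
    //   void manual_seed(uint32_t seed) override;  // error: does not override
};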
(ggml_type)sd_ctx_params->wtype : GGML_TYPE_COUNT; - if (wtype == GGML_TYPE_COUNT) { - model_wtype = model_loader.get_sd_wtype(); - if (model_wtype == GGML_TYPE_COUNT) { - model_wtype = GGML_TYPE_F32; - LOG_WARN("can not get mode wtype frome weight, use f32"); - } - conditioner_wtype = model_loader.get_conditioner_wtype(); - if (conditioner_wtype == GGML_TYPE_COUNT) { - conditioner_wtype = wtype; - } - diffusion_model_wtype = model_loader.get_diffusion_model_wtype(); - if (diffusion_model_wtype == GGML_TYPE_COUNT) { - diffusion_model_wtype = wtype; - } - vae_wtype = model_loader.get_vae_wtype(); - - if (vae_wtype == GGML_TYPE_COUNT) { - vae_wtype = wtype; - } - } else { - model_wtype = wtype; - conditioner_wtype = wtype; - diffusion_model_wtype = wtype; - vae_wtype = wtype; + if (wtype != GGML_TYPE_COUNT) { model_loader.set_wtype_override(wtype); } - LOG_INFO("Weight type: %s", ggml_type_name(model_wtype)); - LOG_INFO("Conditioner weight type: %s", ggml_type_name(conditioner_wtype)); - LOG_INFO("Diffusion model weight type: %s", ggml_type_name(diffusion_model_wtype)); - LOG_INFO("VAE weight type: %s", ggml_type_name(vae_wtype)); + std::map wtype_stat = model_loader.get_wtype_stat(); + std::map conditioner_wtype_stat = model_loader.get_conditioner_wtype_stat(); + std::map diffusion_model_wtype_stat = model_loader.get_diffusion_model_wtype_stat(); + std::map vae_wtype_stat = model_loader.get_vae_wtype_stat(); + + auto wtype_stat_to_str = [](const std::map& m, int key_width = 8, int value_width = 5) -> std::string { + std::ostringstream oss; + bool first = true; + for (const auto& [type, count] : m) { + if (!first) + oss << "|"; + first = false; + oss << std::right << std::setw(key_width) << ggml_type_name(type) + << ": " + << std::left << std::setw(value_width) << count; + } + return oss.str(); + }; + + LOG_INFO("Weight type stat: %s", wtype_stat_to_str(wtype_stat).c_str()); + LOG_INFO("Conditioner weight type stat: %s", wtype_stat_to_str(conditioner_wtype_stat).c_str()); + LOG_INFO("Diffusion model weight type stat: %s", wtype_stat_to_str(diffusion_model_wtype_stat).c_str()); + LOG_INFO("VAE weight type stat: %s", wtype_stat_to_str(vae_wtype_stat).c_str()); LOG_DEBUG("ggml tensor size = %d bytes", (int)sizeof(ggml_tensor)); @@ -536,7 +528,7 @@ class StableDiffusionGGML { // first_stage_model->get_param_tensors(tensors, "first_stage_model."); if (strlen(SAFE_STR(sd_ctx_params->control_net_path)) > 0) { - ggml_backend_t controlnet_backend = NULL; + ggml_backend_t controlnet_backend = nullptr; if (sd_ctx_params->keep_control_net_on_cpu && !ggml_backend_is_cpu(backend)) { LOG_DEBUG("ControlNet: Using CPU backend"); controlnet_backend = ggml_backend_cpu_init(); @@ -592,11 +584,11 @@ class StableDiffusionGGML { struct ggml_init_params params; params.mem_size = static_cast(10 * 1024) * 1024; // 10M - params.mem_buffer = NULL; + params.mem_buffer = nullptr; params.no_alloc = false; // LOG_DEBUG("mem_size %u ", params.mem_size); struct ggml_context* ctx = ggml_init(params); // for alphas_cumprod and is_using_v_parameterization check - GGML_ASSERT(ctx != NULL); + GGML_ASSERT(ctx != nullptr); ggml_tensor* alphas_cumprod_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, TIMESTEPS); calculate_alphas_cumprod((float*)alphas_cumprod_tensor->data); @@ -869,8 +861,8 @@ class StableDiffusionGGML { struct ggml_tensor* timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 1); ggml_set_f32(timesteps, 999); - struct ggml_tensor* concat = is_inpaint ? 
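Usage note on the wtype_stat_to_str lambda: with the default widths (key_width = 8, value_width = 5) each entry is rendered as a right-aligned type name, a colon, and a left-aligned count, joined by '|', so a mixed-precision checkpoint logs a line roughly like the following (illustrative values and spacing):

Diffusion model weight type stat:      f32: 130  |     f16: 24   |    q4_K: 694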
ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 5, 1) : NULL; - if (concat != NULL) { + struct ggml_tensor* concat = is_inpaint ? ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 5, 1) : nullptr; + if (concat != nullptr) { ggml_set_f32(concat, 0); } @@ -938,9 +930,6 @@ class StableDiffusionGGML { } void apply_loras(const std::unordered_map& lora_state) { - if (lora_state.size() > 0 && model_wtype != GGML_TYPE_F16 && model_wtype != GGML_TYPE_F32) { - LOG_WARN("In quantized models when applying LoRA, the images have poor quality."); - } std::unordered_map lora_state_diff; for (auto& kv : lora_state) { const std::string& lora_name = kv.first; @@ -987,7 +976,7 @@ class StableDiffusionGGML { ggml_tensor* prompts_embeds, ggml_tensor* id_embeds, std::vector& class_tokens_mask) { - ggml_tensor* res = NULL; + ggml_tensor* res = nullptr; pmid_model->compute(n_threads, init_img, prompts_embeds, id_embeds, class_tokens_mask, &res, work_ctx); return res; } @@ -997,7 +986,7 @@ class StableDiffusionGGML { bool return_pooled = true, int clip_skip = -1, bool zero_out_masked = false) { - ggml_tensor* output = NULL; + ggml_tensor* output = nullptr; if (zero_out_masked) { if (return_pooled) { output = ggml_new_tensor_1d(work_ctx, @@ -1015,12 +1004,12 @@ class StableDiffusionGGML { sd_image_f32_t image = sd_image_t_to_sd_image_f32_t(init_image); sd_image_f32_t resized_image = clip_preprocess(image, clip_vision->vision_model.image_size, clip_vision->vision_model.image_size); free(image.data); - image.data = NULL; + image.data = nullptr; ggml_tensor* pixel_values = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, resized_image.width, resized_image.height, 3, 1); sd_image_f32_to_tensor(resized_image, pixel_values, false); free(resized_image.data); - resized_image.data = NULL; + resized_image.data = nullptr; // print_ggml_tensor(pixel_values); clip_vision->compute(n_threads, pixel_values, return_pooled, clip_skip, &output, work_ctx); @@ -1042,7 +1031,7 @@ class StableDiffusionGGML { struct ggml_tensor* c_crossattn = get_clip_vision_output(work_ctx, init_image, true, -1, zero_out_masked); // c_concat - struct ggml_tensor* c_concat = NULL; + struct ggml_tensor* c_concat = nullptr; { if (zero_out_masked) { c_concat = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / 8, height / 8, 4, 1); @@ -1054,10 +1043,10 @@ class StableDiffusionGGML { sd_image_f32_t image = sd_image_t_to_sd_image_f32_t(init_image); sd_image_f32_t resized_image = resize_sd_image_f32_t(image, width, height); free(image.data); - image.data = NULL; + image.data = nullptr; sd_image_f32_to_tensor(resized_image, init_img, false); free(resized_image.data); - resized_image.data = NULL; + resized_image.data = nullptr; } else { sd_image_to_tensor(init_image, init_img); } @@ -1074,7 +1063,7 @@ class StableDiffusionGGML { } // y - struct ggml_tensor* y = NULL; + struct ggml_tensor* y = nullptr; { y = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, diffusion_model->get_adm_in_channels()); int out_dim = 256; @@ -1093,7 +1082,7 @@ class StableDiffusionGGML { if (diffusion_model->get_desc() == "Wan2.2-TI2V-5B") { auto new_timesteps = std::vector(init_latent->ne[2], timesteps[0]); - if (denoise_mask != NULL) { + if (denoise_mask != nullptr) { float value = ggml_tensor_get_f32(denoise_mask, 0, 0, 0, 0); if (value == 0.f) { new_timesteps[0] = 0.f; @@ -1140,8 +1129,8 @@ class StableDiffusionGGML { SDCondition id_cond, std::vector ref_latents = {}, bool increase_ref_index = false, - ggml_tensor* denoise_mask = NULL, - ggml_tensor* vace_context = NULL, + ggml_tensor* 
denoise_mask = nullptr, + ggml_tensor* vace_context = nullptr, float vace_strength = 1.f) { if (shifted_timestep > 0 && !sd_version_is_sdxl(version)) { LOG_WARN("timestep shifting is only supported for SDXL models!"); @@ -1168,15 +1157,15 @@ class StableDiffusionGGML { struct ggml_tensor* noised_input = ggml_dup_tensor(work_ctx, x); - bool has_unconditioned = img_cfg_scale != 1.0 && uncond.c_crossattn != NULL; - bool has_img_cond = cfg_scale != img_cfg_scale && img_cond.c_crossattn != NULL; + bool has_unconditioned = img_cfg_scale != 1.0 && uncond.c_crossattn != nullptr; + bool has_img_cond = cfg_scale != img_cfg_scale && img_cond.c_crossattn != nullptr; bool has_skiplayer = slg_scale != 0.0 && skip_layers.size() > 0; // denoise wrapper struct ggml_tensor* out_cond = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor* out_uncond = NULL; - struct ggml_tensor* out_skip = NULL; - struct ggml_tensor* out_img_cond = NULL; + struct ggml_tensor* out_uncond = nullptr; + struct ggml_tensor* out_skip = nullptr; + struct ggml_tensor* out_img_cond = nullptr; if (has_unconditioned) { out_uncond = ggml_dup_tensor(work_ctx, x); @@ -1234,7 +1223,7 @@ class StableDiffusionGGML { std::vector controls; - if (control_hint != NULL && control_net != NULL) { + if (control_hint != nullptr && control_net != nullptr) { control_net->compute(n_threads, noised_input, control_hint, timesteps, cond.c_crossattn, cond.c_vector); controls = control_net->controls; // print_ggml_tensor(controls[12]); @@ -1269,10 +1258,10 @@ class StableDiffusionGGML { &out_cond); } - float* negative_data = NULL; + float* negative_data = nullptr; if (has_unconditioned) { // uncond - if (control_hint != NULL && control_net != NULL) { + if (control_hint != nullptr && control_net != nullptr) { control_net->compute(n_threads, noised_input, control_hint, timesteps, uncond.c_crossattn, uncond.c_vector); controls = control_net->controls; } @@ -1286,7 +1275,7 @@ class StableDiffusionGGML { negative_data = (float*)out_uncond->data; } - float* img_cond_data = NULL; + float* img_cond_data = nullptr; if (has_img_cond) { diffusion_params.context = img_cond.c_crossattn; diffusion_params.c_concat = img_cond.c_concat; @@ -1299,7 +1288,7 @@ class StableDiffusionGGML { int step_count = sigmas.size(); bool is_skiplayer_step = has_skiplayer && step > (int)(guidance.slg.layer_start * step_count) && step < (int)(guidance.slg.layer_end * step_count); - float* skip_layer_data = NULL; + float* skip_layer_data = nullptr; if (is_skiplayer_step) { LOG_DEBUG("Skipping layers at step %d\n", step); // skip layer (same as conditionned) @@ -1490,7 +1479,7 @@ class StableDiffusionGGML { ggml_tensor* vae_encode(ggml_context* work_ctx, ggml_tensor* x, bool encode_video = false) { int64_t t0 = ggml_time_ms(); - ggml_tensor* result = NULL; + ggml_tensor* result = nullptr; int W = x->ne[0] / 8; int H = x->ne[1] / 8; if (vae_tiling_params.enabled && !encode_video) { @@ -1537,7 +1526,7 @@ class StableDiffusionGGML { if (vae_tiling_params.enabled && !encode_video) { // split latent in 32x32 tiles and compute in several steps auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { - tae_first_stage->compute(n_threads, in, false, &out, NULL); + tae_first_stage->compute(n_threads, in, false, &out, nullptr); }; sd_tiling(x, result, 8, 64, 0.5f, on_tiling); } else { @@ -1612,7 +1601,7 @@ class StableDiffusionGGML { int64_t W = x->ne[0] * 8; int64_t H = x->ne[1] * 8; int64_t C = 3; - ggml_tensor* result = NULL; + ggml_tensor* result = nullptr; if (decode_video) { int T = 
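The out_cond / out_uncond / out_img_cond / out_skip buffers prepared above feed the usual classifier-free-guidance blend. As a reference point (standard CFG; the exact weighting of the image-conditioned and skip-layer terms is whatever this sampler implements):

\hat{x} = x_{\mathrm{uncond}} + s_{\mathrm{cfg}}\,(x_{\mathrm{cond}} - x_{\mathrm{uncond}})

When img_cfg_scale differs from cfg_scale, a separate image-conditioned pass supplies an intermediate difference term, and on SLG steps a correction proportional to (x_cond − x_skip) is added with weight slg_scale.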
x->ne[2]; if (sd_version_is_wan(version)) { @@ -1652,7 +1641,7 @@ class StableDiffusionGGML { // split latent in 32x32 tiles and compute in several steps auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { - first_stage_model->compute(n_threads, in, true, &out, NULL); + first_stage_model->compute(n_threads, in, true, &out, nullptr); }; sd_tiling_non_square(x, result, 8, tile_size_x, tile_size_y, tile_overlap, on_tiling); } else { @@ -1829,7 +1818,7 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) { char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { char* buf = (char*)malloc(4096); if (!buf) - return NULL; + return nullptr; buf[0] = '\0'; snprintf(buf + strlen(buf), 4096 - strlen(buf), @@ -1849,7 +1838,6 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { "embedding_dir: %s\n" "photo_maker_path: %s\n" "vae_decode_only: %s\n" - "vae_tiling: %s\n" "free_params_immediately: %s\n" "n_threads: %d\n" "wtype: %s\n" @@ -1913,7 +1901,7 @@ void sd_sample_params_init(sd_sample_params_t* sample_params) { char* sd_sample_params_to_str(const sd_sample_params_t* sample_params) { char* buf = (char*)malloc(4096); if (!buf) - return NULL; + return nullptr; buf[0] = '\0'; snprintf(buf + strlen(buf), 4096 - strlen(buf), @@ -1965,7 +1953,7 @@ void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params) { char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) { char* buf = (char*)malloc(4096); if (!buf) - return NULL; + return nullptr; buf[0] = '\0'; char* sample_params_str = sd_sample_params_to_str(&sd_img_gen_params->sample_params); @@ -1981,6 +1969,7 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) { "seed: %" PRId64 "batch_count: %d\n" "ref_images_count: %d\n" + "auto_resize_ref_image: %s\n" "increase_ref_index: %s\n" "control_strength: %.2f\n" "photo maker: {style_strength = %.2f, id_images_count = %d, id_embed_path = %s}\n" @@ -1995,6 +1984,7 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) { sd_img_gen_params->seed, sd_img_gen_params->batch_count, sd_img_gen_params->ref_images_count, + BOOL_STR(sd_img_gen_params->auto_resize_ref_image), BOOL_STR(sd_img_gen_params->increase_ref_index), sd_img_gen_params->control_strength, sd_img_gen_params->pm_params.style_strength, @@ -2020,40 +2010,40 @@ void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params) { } struct sd_ctx_t { - StableDiffusionGGML* sd = NULL; + StableDiffusionGGML* sd = nullptr; }; sd_ctx_t* new_sd_ctx(const sd_ctx_params_t* sd_ctx_params) { sd_ctx_t* sd_ctx = (sd_ctx_t*)malloc(sizeof(sd_ctx_t)); - if (sd_ctx == NULL) { - return NULL; + if (sd_ctx == nullptr) { + return nullptr; } sd_ctx->sd = new StableDiffusionGGML(); - if (sd_ctx->sd == NULL) { + if (sd_ctx->sd == nullptr) { free(sd_ctx); - return NULL; + return nullptr; } if (!sd_ctx->sd->init(sd_ctx_params)) { delete sd_ctx->sd; - sd_ctx->sd = NULL; + sd_ctx->sd = nullptr; free(sd_ctx); - return NULL; + return nullptr; } return sd_ctx; } void free_sd_ctx(sd_ctx_t* sd_ctx) { - if (sd_ctx->sd != NULL) { + if (sd_ctx->sd != nullptr) { delete sd_ctx->sd; - sd_ctx->sd = NULL; + sd_ctx->sd = nullptr; } free(sd_ctx); } enum sample_method_t sd_get_default_sample_method(const sd_ctx_t* sd_ctx) { - if (sd_ctx != NULL && sd_ctx->sd != NULL) { + if (sd_ctx != nullptr && sd_ctx->sd != nullptr) { SDVersion version = sd_ctx->sd->version; if (sd_version_is_dit(version)) return EULER; @@ -2084,13 +2074,13 @@ sd_image_t* 
generate_image_internal(sd_ctx_t* sd_ctx, std::vector ref_images, std::vector ref_latents, bool increase_ref_index, - ggml_tensor* concat_latent = NULL, - ggml_tensor* denoise_mask = NULL) { + ggml_tensor* concat_latent = nullptr, + ggml_tensor* denoise_mask = nullptr) { if (seed < 0) { // Generally, when using the provided command line, the seed is always >0. // However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library // by a third party with a seed <0, let's incorporate randomization here. - srand((int)time(NULL)); + srand((int)time(nullptr)); seed = rand(); } @@ -2111,7 +2101,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, // Photo Maker std::string prompt_text_only; - ggml_tensor* init_img = NULL; + ggml_tensor* init_img = nullptr; SDCondition id_cond; std::vector class_tokens_mask; @@ -2146,7 +2136,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, sd_image_f32_t id_image = sd_image_t_to_sd_image_f32_t(pm_params.id_images[i]); sd_image_f32_t processed_id_image = clip_preprocess(id_image, clip_image_size, clip_image_size); free(id_image.data); - id_image.data = NULL; + id_image.data = nullptr; processed_id_images.push_back(processed_id_image); } @@ -2157,7 +2147,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, for (auto& image : processed_id_images) { free(image.data); - image.data = NULL; + image.data = nullptr; } processed_id_images.clear(); @@ -2169,7 +2159,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, condition_params); id_cond = std::get<0>(cond_tup); class_tokens_mask = std::get<1>(cond_tup); // - struct ggml_tensor* id_embeds = NULL; + struct ggml_tensor* id_embeds = nullptr; if (pmv2 && pm_params.id_embed_path != nullptr) { id_embeds = load_tensor_from_file(work_ctx, pm_params.id_embed_path); // print_ggml_tensor(id_embeds, true, "id_embeds:"); @@ -2195,7 +2185,6 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, } // Get learned condition - t0 = ggml_time_ms(); condition_params.text = prompt; condition_params.zero_out_masked = false; SDCondition cond = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx, @@ -2223,8 +2212,8 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, } // Control net hint - struct ggml_tensor* image_hint = NULL; - if (control_image.data != NULL) { + struct ggml_tensor* image_hint = nullptr; + if (control_image.data != nullptr) { image_hint = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); sd_image_to_tensor(control_image, image_hint); } @@ -2243,8 +2232,8 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, int H = height / 8; LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]); - struct ggml_tensor* control_latent = NULL; - if (sd_version_is_control(sd_ctx->sd->version) && image_hint != NULL) { + struct ggml_tensor* control_latent = nullptr; + if (sd_version_is_control(sd_ctx->sd->version) && image_hint != nullptr) { control_latent = sd_ctx->sd->encode_first_stage(work_ctx, image_hint); ggml_tensor_scale(control_latent, control_strength); } @@ -2282,8 +2271,8 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, } } - if (sd_ctx->sd->version == VERSION_FLEX_2 && control_latent != NULL && sd_ctx->sd->control_net == NULL) { - bool no_inpaint = concat_latent == NULL; + if (sd_ctx->sd->version == VERSION_FLEX_2 && control_latent != nullptr && sd_ctx->sd->control_net == nullptr) { + bool no_inpaint = concat_latent == nullptr; if (no_inpaint) { concat_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 
init_latent->ne[0], init_latent->ne[1], mask_channels + init_latent->ne[2], 1); } @@ -2302,33 +2291,33 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, } } } - } else if (concat_latent == NULL) { + } else if (concat_latent == nullptr) { concat_latent = empty_latent; } cond.c_concat = concat_latent; uncond.c_concat = empty_latent; - denoise_mask = NULL; + denoise_mask = nullptr; } else if (sd_version_is_unet_edit(sd_ctx->sd->version)) { auto empty_latent = ggml_dup_tensor(work_ctx, init_latent); ggml_set_f32(empty_latent, 0); uncond.c_concat = empty_latent; cond.c_concat = ref_latents[0]; - if (cond.c_concat == NULL) { + if (cond.c_concat == nullptr) { cond.c_concat = empty_latent; } } else if (sd_version_is_control(sd_ctx->sd->version)) { auto empty_latent = ggml_dup_tensor(work_ctx, init_latent); ggml_set_f32(empty_latent, 0); uncond.c_concat = empty_latent; - if (sd_ctx->sd->control_net == NULL) { + if (sd_ctx->sd->control_net == nullptr) { cond.c_concat = control_latent; } - if (cond.c_concat == NULL) { + if (cond.c_concat == nullptr) { cond.c_concat = empty_latent; } } SDCondition img_cond; - if (uncond.c_crossattn != NULL && + if (uncond.c_crossattn != nullptr && (sd_version_is_inpaint_or_unet_edit(sd_ctx->sd->version) && guidance.txt_cfg != guidance.img_cfg)) { img_cond = SDCondition(uncond.c_crossattn, uncond.c_vector, cond.c_concat); } @@ -2389,7 +2378,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, t1 = ggml_time_ms(); struct ggml_tensor* img = sd_ctx->sd->decode_first_stage(work_ctx, final_latents[i] /* x_0 */); // print_ggml_tensor(img); - if (img != NULL) { + if (img != nullptr) { decoded_images.push_back(img); } int64_t t2 = ggml_time_ms(); @@ -2402,9 +2391,9 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, sd_ctx->sd->first_stage_model->free_params_buffer(); } sd_image_t* result_images = (sd_image_t*)calloc(batch_count, sizeof(sd_image_t)); - if (result_images == NULL) { + if (result_images == nullptr) { ggml_free(work_ctx); - return NULL; + return nullptr; } for (size_t i = 0; i < decoded_images.size(); i++) { @@ -2469,35 +2458,35 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g model_version_to_str[sd_ctx->sd->version], width, height); - return NULL; + return nullptr; } } else if (width % 64 || height % 64) { LOG_ERROR("Image dimensions must be must be a multiple of 64 on each axis for %s models. 
(Got %dx%d)", model_version_to_str[sd_ctx->sd->version], width, height); - return NULL; + return nullptr; } LOG_DEBUG("generate_image %dx%d", width, height); - if (sd_ctx == NULL || sd_img_gen_params == NULL) { - return NULL; + if (sd_ctx == nullptr || sd_img_gen_params == nullptr) { + return nullptr; } struct ggml_init_params params; params.mem_size = static_cast(1024 * 1024) * 1024; // 1G - params.mem_buffer = NULL; + params.mem_buffer = nullptr; params.no_alloc = false; // LOG_DEBUG("mem_size %u ", params.mem_size); struct ggml_context* work_ctx = ggml_init(params); if (!work_ctx) { LOG_ERROR("ggml_init() failed"); - return NULL; + return nullptr; } int64_t seed = sd_img_gen_params->seed; if (seed < 0) { - srand((int)time(NULL)); + srand((int)time(nullptr)); seed = rand(); } sd_ctx->sd->rng->manual_seed(seed); @@ -2509,9 +2498,9 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g sd_ctx->sd->init_scheduler(sd_img_gen_params->sample_params.scheduler); std::vector sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps); - ggml_tensor* init_latent = NULL; - ggml_tensor* concat_latent = NULL; - ggml_tensor* denoise_mask = NULL; + ggml_tensor* init_latent = nullptr; + ggml_tensor* concat_latent = nullptr; + ggml_tensor* denoise_mask = nullptr; if (sd_img_gen_params->init_image.data) { LOG_INFO("IMG2IMG"); @@ -2538,7 +2527,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g } else if (sd_ctx->sd->version == VERSION_FLEX_2) { mask_channels = 1 + init_latent->ne[2]; } - ggml_tensor* masked_latent = NULL; + ggml_tensor* masked_latent = nullptr; if (sd_ctx->sd->version != VERSION_FLEX_2) { // most inpaint models mask before vae @@ -2635,14 +2624,20 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g std::vector ref_latents; for (int i = 0; i < ref_images.size(); i++) { ggml_tensor* img; - if (sd_version_is_qwen_image(sd_ctx->sd->version)) { + if (sd_img_gen_params->auto_resize_ref_image) { + LOG_DEBUG("auto resize ref images"); sd_image_f32_t ref_image = sd_image_t_to_sd_image_f32_t(*ref_images[i]); int VAE_IMAGE_SIZE = std::min(1024 * 1024, width * height); double vae_width = sqrt(VAE_IMAGE_SIZE * ref_image.width / ref_image.height); double vae_height = vae_width * ref_image.height / ref_image.width; - vae_height = round(vae_height / 32) * 32; - vae_width = round(vae_width / 32) * 32; + int factor = 16; + if (sd_version_is_qwen_image(sd_ctx->sd->version)) { + factor = 32; + } + + vae_height = round(vae_height / factor) * factor; + vae_width = round(vae_width / factor) * factor; sd_image_f32_t resized_image = resize_sd_image_f32_t(ref_image, static_cast(vae_width), static_cast(vae_height)); free(ref_image.data); @@ -2675,7 +2670,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g ref_latents.push_back(latent); } - if (sd_img_gen_params->init_image.data != NULL || sd_img_gen_params->ref_images_count > 0) { + if (sd_img_gen_params->init_image.data != nullptr || sd_img_gen_params->ref_images_count > 0) { size_t t1 = ggml_time_ms(); LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); } @@ -2717,8 +2712,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g } SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params, int* num_frames_out) { - if (sd_ctx == NULL || sd_vid_gen_params == NULL) { - return NULL; + if (sd_ctx == nullptr || sd_vid_gen_params == nullptr) { + return 
nullptr; } std::string prompt = SAFE_STR(sd_vid_gen_params->prompt); @@ -2755,24 +2750,23 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s } } LOG_DEBUG("switching from high noise model at step %d", high_noise_sample_steps); - sample_steps = total_steps - high_noise_sample_steps; } struct ggml_init_params params; params.mem_size = static_cast(1024 * 1024) * 1024; // 1G - params.mem_buffer = NULL; + params.mem_buffer = nullptr; params.no_alloc = false; // LOG_DEBUG("mem_size %u ", params.mem_size); struct ggml_context* work_ctx = ggml_init(params); if (!work_ctx) { LOG_ERROR("ggml_init() failed"); - return NULL; + return nullptr; } int64_t seed = sd_vid_gen_params->seed; if (seed < 0) { - seed = (int)time(NULL); + seed = (int)time(nullptr); } sd_ctx->sd->rng->manual_seed(seed); @@ -2782,11 +2776,11 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s // Apply lora prompt = sd_ctx->sd->apply_loras_from_prompt(prompt); - ggml_tensor* init_latent = NULL; - ggml_tensor* clip_vision_output = NULL; - ggml_tensor* concat_latent = NULL; - ggml_tensor* denoise_mask = NULL; - ggml_tensor* vace_context = NULL; + ggml_tensor* init_latent = nullptr; + ggml_tensor* clip_vision_output = nullptr; + ggml_tensor* concat_latent = nullptr; + ggml_tensor* denoise_mask = nullptr; + ggml_tensor* vace_context = nullptr; int64_t ref_image_num = 0; // for vace if (sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-I2V-14B" || sd_ctx->sd->diffusion_model->get_desc() == "Wan2.2-I2V-14B" || @@ -2802,7 +2796,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s } if (sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-FLF2V-14B") { - ggml_tensor* end_image_clip_vision_output = NULL; + ggml_tensor* end_image_clip_vision_output = nullptr; if (sd_vid_gen_params->end_image.data) { end_image_clip_vision_output = sd_ctx->sd->get_clip_vision_output(work_ctx, sd_vid_gen_params->end_image, false, -2); } else { @@ -2883,7 +2877,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s sd_ctx->sd->diffusion_model->get_desc() == "Wan2.x-VACE-14B") { LOG_INFO("VACE"); int64_t t1 = ggml_time_ms(); - ggml_tensor* ref_image_latent = NULL; + ggml_tensor* ref_image_latent = nullptr; if (sd_vid_gen_params->init_image.data) { ggml_tensor* ref_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); sd_image_to_tensor(sd_vid_gen_params->init_image, ref_img); @@ -2956,7 +2950,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s LOG_INFO("encode_first_stage completed, taking %" PRId64 " ms", t2 - t1); } - if (init_latent == NULL) { + if (init_latent == nullptr) { init_latent = generate_init_latent(sd_ctx, work_ctx, width, height, frames, true); } @@ -3019,7 +3013,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s cond, uncond, {}, - NULL, + nullptr, 0, sd_vid_gen_params->high_noise_sample_params.guidance, sd_vid_gen_params->high_noise_sample_params.eta, @@ -3039,7 +3033,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s if (sd_ctx->sd->free_params_immediately) { sd_ctx->sd->high_noise_diffusion_model->free_params_buffer(); } - noise = NULL; + noise = nullptr; } // Sample @@ -3055,7 +3049,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s cond, uncond, {}, - NULL, + nullptr, 0, sd_vid_gen_params->sample_params.guidance, sd_vid_gen_params->sample_params.eta, @@ -3101,9 
+3095,9 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s } sd_image_t* result_images = (sd_image_t*)calloc(vid->ne[2], sizeof(sd_image_t)); - if (result_images == NULL) { + if (result_images == nullptr) { ggml_free(work_ctx); - return NULL; + return nullptr; } *num_frames_out = vid->ne[2]; diff --git a/stable-diffusion.h b/stable-diffusion.h index a891a58f..59a25cdc 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -216,6 +216,7 @@ typedef struct { sd_image_t init_image; sd_image_t* ref_images; int ref_images_count; + bool auto_resize_ref_image; bool increase_ref_index; sd_image_t mask_image; int width; @@ -292,7 +293,8 @@ typedef struct upscaler_ctx_t upscaler_ctx_t; SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path, bool offload_params_to_cpu, bool direct, - int n_threads); + int n_threads, + int tile_size); SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx); SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, diff --git a/t5.hpp b/t5.hpp index 15f7af80..1067a050 100644 --- a/t5.hpp +++ b/t5.hpp @@ -1,7 +1,7 @@ #ifndef __T5_HPP__ #define __T5_HPP__ -#include +#include #include #include #include @@ -461,7 +461,7 @@ class T5LayerNorm : public UnaryBlock { int64_t hidden_size; float eps; - void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") { + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") override { enum ggml_type wtype = GGML_TYPE_F32; params["weight"] = ggml_new_tensor_1d(ctx, wtype, hidden_size); } @@ -472,7 +472,7 @@ class T5LayerNorm : public UnaryBlock { : hidden_size(hidden_size), eps(eps) {} - struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { + struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override { struct ggml_tensor* w = params["weight"]; x = ggml_rms_norm(ctx, x, eps); x = ggml_mul(ctx, x, w); @@ -487,7 +487,7 @@ struct T5DenseActDense : public UnaryBlock { blocks["wo"] = std::shared_ptr(new Linear(ff_dim, model_dim, false)); } - struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { + struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override { // x: [N, n_token, model_dim] auto wi = std::dynamic_pointer_cast(blocks["wi"]); auto wo = std::dynamic_pointer_cast(blocks["wo"]); @@ -509,7 +509,7 @@ struct T5DenseGatedActDense : public UnaryBlock { blocks["wo"] = std::shared_ptr(new Linear(ff_dim, model_dim, false, false, false, scale)); } - struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { + struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override { // x: [N, n_token, model_dim] auto wi_0 = std::dynamic_pointer_cast(blocks["wi_0"]); auto wi_1 = std::dynamic_pointer_cast(blocks["wi_1"]); @@ -530,7 +530,7 @@ struct T5LayerFF : public UnaryBlock { blocks["layer_norm"] = std::shared_ptr(new T5LayerNorm(model_dim)); } - struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { + struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override { // x: [N, n_token, model_dim] auto DenseReluDense = std::dynamic_pointer_cast(blocks["DenseReluDense"]); auto layer_norm = std::dynamic_pointer_cast(blocks["layer_norm"]); @@ -582,9 +582,9 @@ class T5Attention : public GGMLBlock { std::pair forward(struct ggml_context* ctx, ggml_backend_t backend, struct ggml_tensor* x, - struct 
ggml_tensor* past_bias = NULL, - struct ggml_tensor* mask = NULL, - struct ggml_tensor* relative_position_bucket = NULL) { + struct ggml_tensor* past_bias = nullptr, + struct ggml_tensor* mask = nullptr, + struct ggml_tensor* relative_position_bucket = nullptr) { auto q_proj = std::dynamic_pointer_cast(blocks["q"]); auto k_proj = std::dynamic_pointer_cast(blocks["k"]); auto v_proj = std::dynamic_pointer_cast(blocks["v"]); @@ -597,11 +597,11 @@ class T5Attention : public GGMLBlock { auto k = k_proj->forward(ctx, x); auto v = v_proj->forward(ctx, x); - if (using_relative_attention_bias && relative_position_bucket != NULL) { + if (using_relative_attention_bias && relative_position_bucket != nullptr) { past_bias = compute_bias(ctx, relative_position_bucket); } - if (past_bias != NULL) { - if (mask != NULL) { + if (past_bias != nullptr) { + if (mask != nullptr) { mask = ggml_repeat(ctx, mask, past_bias); mask = ggml_add(ctx, mask, past_bias); } else { @@ -632,9 +632,9 @@ struct T5LayerSelfAttention : public GGMLBlock { std::pair forward(struct ggml_context* ctx, ggml_backend_t backend, struct ggml_tensor* x, - struct ggml_tensor* past_bias = NULL, - struct ggml_tensor* mask = NULL, - struct ggml_tensor* relative_position_bucket = NULL) { + struct ggml_tensor* past_bias = nullptr, + struct ggml_tensor* mask = nullptr, + struct ggml_tensor* relative_position_bucket = nullptr) { // x: [N, n_token, model_dim] auto SelfAttention = std::dynamic_pointer_cast(blocks["SelfAttention"]); auto layer_norm = std::dynamic_pointer_cast(blocks["layer_norm"]); @@ -659,9 +659,9 @@ struct T5Block : public GGMLBlock { std::pair forward(struct ggml_context* ctx, ggml_backend_t backend, struct ggml_tensor* x, - struct ggml_tensor* past_bias = NULL, - struct ggml_tensor* mask = NULL, - struct ggml_tensor* relative_position_bucket = NULL) { + struct ggml_tensor* past_bias = nullptr, + struct ggml_tensor* mask = nullptr, + struct ggml_tensor* relative_position_bucket = nullptr) { // x: [N, n_token, model_dim] auto layer_0 = std::dynamic_pointer_cast(blocks["layer.0"]); auto layer_1 = std::dynamic_pointer_cast(blocks["layer.1"]); @@ -695,9 +695,9 @@ struct T5Stack : public GGMLBlock { struct ggml_tensor* forward(struct ggml_context* ctx, ggml_backend_t backend, struct ggml_tensor* x, - struct ggml_tensor* past_bias = NULL, - struct ggml_tensor* attention_mask = NULL, - struct ggml_tensor* relative_position_bucket = NULL) { + struct ggml_tensor* past_bias = nullptr, + struct ggml_tensor* attention_mask = nullptr, + struct ggml_tensor* relative_position_bucket = nullptr) { // x: [N, n_token, model_dim] for (int i = 0; i < num_layers; i++) { auto block = std::dynamic_pointer_cast(blocks["block." 
+ std::to_string(i)]); @@ -743,9 +743,9 @@ struct T5 : public GGMLBlock { struct ggml_tensor* forward(struct ggml_context* ctx, ggml_backend_t backend, struct ggml_tensor* input_ids, - struct ggml_tensor* past_bias = NULL, - struct ggml_tensor* attention_mask = NULL, - struct ggml_tensor* relative_position_bucket = NULL) { + struct ggml_tensor* past_bias = nullptr, + struct ggml_tensor* attention_mask = nullptr, + struct ggml_tensor* relative_position_bucket = nullptr) { // input_ids: [N, n_token] auto shared = std::dynamic_pointer_cast(blocks["shared"]); @@ -776,7 +776,7 @@ struct T5Runner : public GGMLRunner { model.init(params_ctx, tensor_types, prefix); } - std::string get_desc() { + std::string get_desc() override { return "t5"; } @@ -788,16 +788,16 @@ struct T5Runner : public GGMLRunner { ggml_backend_t backend, struct ggml_tensor* input_ids, struct ggml_tensor* relative_position_bucket, - struct ggml_tensor* attention_mask = NULL) { + struct ggml_tensor* attention_mask = nullptr) { size_t N = input_ids->ne[1]; size_t n_token = input_ids->ne[0]; - auto hidden_states = model.forward(ctx, backend, input_ids, NULL, attention_mask, relative_position_bucket); // [N, n_token, model_dim] + auto hidden_states = model.forward(ctx, backend, input_ids, nullptr, attention_mask, relative_position_bucket); // [N, n_token, model_dim] return hidden_states; } struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids, - struct ggml_tensor* attention_mask = NULL) { + struct ggml_tensor* attention_mask = nullptr) { struct ggml_cgraph* gf = ggml_new_graph(compute_ctx); input_ids = to_backend(input_ids); @@ -829,7 +829,7 @@ struct T5Runner : public GGMLRunner { struct ggml_tensor* input_ids, struct ggml_tensor* attention_mask, ggml_tensor** output, - ggml_context* output_ctx = NULL) { + ggml_context* output_ctx = nullptr) { auto get_graph = [&]() -> struct ggml_cgraph* { return build_graph(input_ids, attention_mask); }; @@ -968,11 +968,11 @@ struct T5Embedder { void test() { struct ggml_init_params params; params.mem_size = static_cast(10 * 1024 * 1024); // 10 MB - params.mem_buffer = NULL; + params.mem_buffer = nullptr; params.no_alloc = false; struct ggml_context* work_ctx = ggml_init(params); - GGML_ASSERT(work_ctx != NULL); + GGML_ASSERT(work_ctx != nullptr); { std::string text("a lovely cat"); @@ -987,7 +987,7 @@ struct T5Embedder { printf("\n"); auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens); auto attention_mask = vector_to_ggml_tensor(work_ctx, masks); - struct ggml_tensor* out = NULL; + struct ggml_tensor* out = nullptr; int t0 = ggml_time_ms(); model.compute(8, input_ids, attention_mask, &out, work_ctx); @@ -1022,7 +1022,7 @@ struct T5Embedder { } } - std::shared_ptr t5 = std::shared_ptr(new T5Embedder(backend, false, tensor_types, "", true)); + std::shared_ptr t5 = std::make_shared(backend, false, tensor_types, "", true); t5->alloc_params_buffer(); std::map tensors; diff --git a/tae.hpp b/tae.hpp index 41bcbe2f..d630325d 100644 --- a/tae.hpp +++ b/tae.hpp @@ -29,7 +29,7 @@ class TAEBlock : public UnaryBlock { } } - struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { + struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override { // x: [n, n_in, h, w] // return: [n, n_out, h, w] @@ -86,7 +86,7 @@ class TinyEncoder : public UnaryBlock { blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, z_channels, {3, 3}, {1, 1}, {1, 1})); } - struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { + 
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override { // x: [n, in_channels, h, w] // return: [n, z_channels, h/8, w/8] @@ -136,7 +136,7 @@ class TinyDecoder : public UnaryBlock { blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1})); } - struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* z) { + struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* z) override { // z: [n, z_channels, h, w] // return: [n, out_channels, h*8, w*8] @@ -218,7 +218,7 @@ struct TinyAutoEncoder : public GGMLRunner { } } - std::string get_desc() { + std::string get_desc() override { return "taesd"; } @@ -261,7 +261,7 @@ struct TinyAutoEncoder : public GGMLRunner { struct ggml_tensor* z, bool decode_graph, struct ggml_tensor** output, - struct ggml_context* output_ctx = NULL) { + struct ggml_context* output_ctx = nullptr) { auto get_graph = [&]() -> struct ggml_cgraph* { return build_graph(z, decode_graph); }; diff --git a/unet.hpp b/unet.hpp index 19bedb32..7022a7c9 100644 --- a/unet.hpp +++ b/unet.hpp @@ -384,8 +384,8 @@ class UnetModelBlock : public GGMLBlock { struct ggml_tensor* x, struct ggml_tensor* timesteps, struct ggml_tensor* context, - struct ggml_tensor* c_concat = NULL, - struct ggml_tensor* y = NULL, + struct ggml_tensor* c_concat = nullptr, + struct ggml_tensor* y = nullptr, int num_video_frames = -1, std::vector controls = {}, float control_strength = 0.f) { @@ -395,20 +395,20 @@ class UnetModelBlock : public GGMLBlock { // c_concat: [N, in_channels, h, w] or [1, in_channels, h, w] // y: [N, adm_in_channels] or [1, adm_in_channels] // return: [N, out_channels, h, w] - if (context != NULL) { + if (context != nullptr) { if (context->ne[2] != x->ne[3]) { context = ggml_repeat(ctx, context, ggml_new_tensor_3d(ctx, GGML_TYPE_F32, context->ne[0], context->ne[1], x->ne[3])); } } - if (c_concat != NULL) { + if (c_concat != nullptr) { if (c_concat->ne[3] != x->ne[3]) { c_concat = ggml_repeat(ctx, c_concat, x); } x = ggml_concat(ctx, x, c_concat, 2); } - if (y != NULL) { + if (y != nullptr) { if (y->ne[1] != x->ne[3]) { y = ggml_repeat(ctx, y, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, y->ne[0], x->ne[3])); } @@ -428,7 +428,7 @@ class UnetModelBlock : public GGMLBlock { emb = time_embed_2->forward(ctx, emb); // [N, time_embed_dim] // SDXL/SVD - if (y != NULL) { + if (y != nullptr) { auto label_embed_0 = std::dynamic_pointer_cast(blocks["label_emb.0.0"]); auto label_embed_2 = std::dynamic_pointer_cast(blocks["label_emb.0.2"]); @@ -562,7 +562,7 @@ struct UNetModelRunner : public GGMLRunner { } } - std::string get_desc() { + std::string get_desc() override { return "unet"; } @@ -573,8 +573,8 @@ struct UNetModelRunner : public GGMLRunner { struct ggml_cgraph* build_graph(struct ggml_tensor* x, struct ggml_tensor* timesteps, struct ggml_tensor* context, - struct ggml_tensor* c_concat = NULL, - struct ggml_tensor* y = NULL, + struct ggml_tensor* c_concat = nullptr, + struct ggml_tensor* y = nullptr, int num_video_frames = -1, std::vector controls = {}, float control_strength = 0.f) { @@ -619,8 +619,8 @@ struct UNetModelRunner : public GGMLRunner { int num_video_frames = -1, std::vector controls = {}, float control_strength = 0.f, - struct ggml_tensor** output = NULL, - struct ggml_context* output_ctx = NULL) { + struct ggml_tensor** output = nullptr, + struct ggml_context* output_ctx = nullptr) { // x: [N, in_channels, h, w] // timesteps: [N, ] // context: [N, max_position, 
hidden_size]([N, 77, 768]) or [1, max_position, hidden_size] @@ -636,11 +636,11 @@ struct UNetModelRunner : public GGMLRunner { void test() { struct ggml_init_params params; params.mem_size = static_cast(10 * 1024 * 1024); // 10 MB - params.mem_buffer = NULL; + params.mem_buffer = nullptr; params.no_alloc = false; struct ggml_context* work_ctx = ggml_init(params); - GGML_ASSERT(work_ctx != NULL); + GGML_ASSERT(work_ctx != nullptr); { // CPU, num_video_frames = 1, x{num_video_frames, 8, 8, 8}: Pass @@ -663,10 +663,10 @@ struct UNetModelRunner : public GGMLRunner { ggml_set_f32(y, 0.5f); // print_ggml_tensor(y); - struct ggml_tensor* out = NULL; + struct ggml_tensor* out = nullptr; int t0 = ggml_time_ms(); - compute(8, x, timesteps, context, NULL, y, num_video_frames, {}, 0.f, &out, work_ctx); + compute(8, x, timesteps, context, nullptr, y, num_video_frames, {}, 0.f, &out, work_ctx); int t1 = ggml_time_ms(); print_ggml_tensor(out); diff --git a/upscaler.cpp b/upscaler.cpp index d3042372..68eb50ef 100644 --- a/upscaler.cpp +++ b/upscaler.cpp @@ -4,17 +4,20 @@ #include "stable-diffusion.h" struct UpscalerGGML { - ggml_backend_t backend = NULL; // general backend + ggml_backend_t backend = nullptr; // general backend ggml_type model_data_type = GGML_TYPE_F16; std::shared_ptr esrgan_upscaler; std::string esrgan_path; int n_threads; bool direct = false; + int tile_size = 128; UpscalerGGML(int n_threads, - bool direct = false) + bool direct = false, + int tile_size = 128) : n_threads(n_threads), - direct(direct) { + direct(direct), + tile_size(tile_size) { } bool load_from_file(const std::string& esrgan_path, @@ -51,7 +54,7 @@ struct UpscalerGGML { backend = ggml_backend_cpu_init(); } LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type)); - esrgan_upscaler = std::make_shared(backend, offload_params_to_cpu, model_loader.tensor_storages_types); + esrgan_upscaler = std::make_shared(backend, offload_params_to_cpu, tile_size, model_loader.tensor_storages_types); if (direct) { esrgan_upscaler->enable_conv2d_direct(); } @@ -63,7 +66,7 @@ struct UpscalerGGML { sd_image_t upscale(sd_image_t input_image, uint32_t upscale_factor) { // upscale_factor, unused for RealESRGAN_x4plus_anime_6B.pth - sd_image_t upscaled_image = {0, 0, 0, NULL}; + sd_image_t upscaled_image = {0, 0, 0, nullptr}; int output_width = (int)input_image.width * esrgan_upscaler->scale; int output_height = (int)input_image.height * esrgan_upscaler->scale; LOG_INFO("upscaling from (%i x %i) to (%i x %i)", @@ -71,7 +74,7 @@ struct UpscalerGGML { struct ggml_init_params params; params.mem_size = static_cast(1024 * 1024) * 1024; // 1G - params.mem_buffer = NULL; + params.mem_buffer = nullptr; params.no_alloc = false; // draft context @@ -107,29 +110,30 @@ struct UpscalerGGML { }; struct upscaler_ctx_t { - UpscalerGGML* upscaler = NULL; + UpscalerGGML* upscaler = nullptr; }; upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str, bool offload_params_to_cpu, bool direct, - int n_threads) { + int n_threads, + int tile_size) { upscaler_ctx_t* upscaler_ctx = (upscaler_ctx_t*)malloc(sizeof(upscaler_ctx_t)); - if (upscaler_ctx == NULL) { - return NULL; + if (upscaler_ctx == nullptr) { + return nullptr; } std::string esrgan_path(esrgan_path_c_str); - upscaler_ctx->upscaler = new UpscalerGGML(n_threads, direct); - if (upscaler_ctx->upscaler == NULL) { - return NULL; + upscaler_ctx->upscaler = new UpscalerGGML(n_threads, direct, tile_size); + if (upscaler_ctx->upscaler == nullptr) { + return nullptr; } if 
(!upscaler_ctx->upscaler->load_from_file(esrgan_path, offload_params_to_cpu, n_threads)) { delete upscaler_ctx->upscaler; - upscaler_ctx->upscaler = NULL; + upscaler_ctx->upscaler = nullptr; free(upscaler_ctx); - return NULL; + return nullptr; } return upscaler_ctx; } @@ -139,16 +143,16 @@ sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, sd_image_t input_image, uint32_ } int get_upscale_factor(upscaler_ctx_t* upscaler_ctx) { - if (upscaler_ctx == NULL || upscaler_ctx->upscaler == NULL || upscaler_ctx->upscaler->esrgan_upscaler == NULL) { + if (upscaler_ctx == nullptr || upscaler_ctx->upscaler == nullptr || upscaler_ctx->upscaler->esrgan_upscaler == nullptr) { return 1; } return upscaler_ctx->upscaler->esrgan_upscaler->scale; } void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx) { - if (upscaler_ctx->upscaler != NULL) { + if (upscaler_ctx->upscaler != nullptr) { delete upscaler_ctx->upscaler; - upscaler_ctx->upscaler = NULL; + upscaler_ctx->upscaler = nullptr; } free(upscaler_ctx); } diff --git a/util.cpp b/util.cpp index 1d0bbd2b..d6d06752 100644 --- a/util.cpp +++ b/util.cpp @@ -1,8 +1,8 @@ #include "util.h" -#include #include #include #include +#include #include #include #include @@ -64,7 +64,7 @@ std::string format(const char* fmt, ...) { va_list ap2; va_start(ap, fmt); va_copy(ap2, ap); - int size = vsnprintf(NULL, 0, fmt, ap); + int size = vsnprintf(nullptr, 0, fmt, ap); std::vector buf(size + 1); int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); va_end(ap2); @@ -170,11 +170,11 @@ int32_t get_num_physical_cores() { #elif defined(__APPLE__) && defined(__MACH__) int32_t num_physical_cores; size_t len = sizeof(num_physical_cores); - int result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, NULL, 0); + int result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, nullptr, 0); if (result == 0) { return num_physical_cores; } - result = sysctlbyname("hw.physicalcpu", &num_physical_cores, &len, NULL, 0); + result = sysctlbyname("hw.physicalcpu", &num_physical_cores, &len, nullptr, 0); if (result == 0) { return num_physical_cores; } @@ -185,8 +185,8 @@ int32_t get_num_physical_cores() { return n_threads > 0 ? (n_threads <= 4 ? 
n_threads : n_threads / 2) : 4; } -static sd_progress_cb_t sd_progress_cb = NULL; -void* sd_progress_cb_data = NULL; +static sd_progress_cb_t sd_progress_cb = nullptr; +void* sd_progress_cb_data = nullptr; std::u32string utf8_to_utf32(const std::string& utf8_str) { std::wstring_convert, char32_t> converter; @@ -296,8 +296,8 @@ std::string trim(const std::string& s) { return rtrim(ltrim(s)); } -static sd_log_cb_t sd_log_cb = NULL; -void* sd_log_cb_data = NULL; +static sd_log_cb_t sd_log_cb = nullptr; +void* sd_log_cb_data = nullptr; #define LOG_BUFFER_SIZE 4096 diff --git a/vae.hpp b/vae.hpp index 20d97a2a..455edae0 100644 --- a/vae.hpp +++ b/vae.hpp @@ -30,7 +30,7 @@ class ResnetBlock : public UnaryBlock { } } - struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { + struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override { // x: [N, in_channels, h, w] // t_emb is always None auto norm1 = std::dynamic_pointer_cast(blocks["norm1"]); @@ -76,7 +76,7 @@ class AttnBlock : public UnaryBlock { blocks["proj_out"] = std::shared_ptr(new Conv2d(in_channels, in_channels, {1, 1})); } - struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { + struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override { // x: [N, in_channels, h, w] auto norm = std::dynamic_pointer_cast(blocks["norm"]); auto q_proj = std::dynamic_pointer_cast(blocks["q"]); @@ -134,7 +134,7 @@ class AE3DConv : public Conv2d { } struct ggml_tensor* forward(struct ggml_context* ctx, - struct ggml_tensor* x) { + struct ggml_tensor* x) override { // timesteps always None // skip_video always False // x: [N, IC, IH, IW] @@ -163,7 +163,7 @@ class AE3DConv : public Conv2d { class VideoResnetBlock : public ResnetBlock { protected: - void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") { + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") override { enum ggml_type wtype = get_type(prefix + "mix_factor", tensor_types, GGML_TYPE_F32); params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1); } @@ -182,7 +182,7 @@ class VideoResnetBlock : public ResnetBlock { blocks["time_stack"] = std::shared_ptr(new ResBlock(out_channels, 0, out_channels, {video_kernel_size, 1}, 3, false, true)); } - struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { + struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override { // x: [N, in_channels, h, w] aka [b*t, in_channels, h, w] // return: [N, out_channels, h, w] aka [b*t, out_channels, h, w] // t_emb is always None @@ -548,7 +548,7 @@ struct AutoEncoderKL : public VAE { ae.init(params_ctx, tensor_types, prefix); } - void enable_conv2d_direct() { + void enable_conv2d_direct() override { std::vector blocks; ae.get_all_blocks(blocks); for (auto block : blocks) { @@ -559,7 +559,7 @@ struct AutoEncoderKL : public VAE { } } - void set_conv2d_scale(float scale) { + void set_conv2d_scale(float scale) override { std::vector blocks; ae.get_all_blocks(blocks); for (auto block : blocks) { @@ -570,11 +570,11 @@ struct AutoEncoderKL : public VAE { } } - std::string get_desc() { + std::string get_desc() override { return "vae"; } - void get_param_tensors(std::map& tensors, const std::string prefix) { + void get_param_tensors(std::map& tensors, const std::string prefix) override { ae.get_param_tensors(tensors, prefix); } @@ -594,7 +594,7 @@ struct 
AutoEncoderKL : public VAE { struct ggml_tensor* z, bool decode_graph, struct ggml_tensor** output, - struct ggml_context* output_ctx = NULL) { + struct ggml_context* output_ctx = nullptr) override { GGML_ASSERT(!decode_only || decode_graph); auto get_graph = [&]() -> struct ggml_cgraph* { return build_graph(z, decode_graph); @@ -607,11 +607,11 @@ struct AutoEncoderKL : public VAE { void test() { struct ggml_init_params params; params.mem_size = static_cast(10 * 1024 * 1024); // 10 MB - params.mem_buffer = NULL; + params.mem_buffer = nullptr; params.no_alloc = false; struct ggml_context* work_ctx = ggml_init(params); - GGML_ASSERT(work_ctx != NULL); + GGML_ASSERT(work_ctx != nullptr); { // CPU, x{1, 3, 64, 64}: Pass @@ -621,7 +621,7 @@ struct AutoEncoderKL : public VAE { auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 64, 64, 3, 2); ggml_set_f32(x, 0.5f); print_ggml_tensor(x); - struct ggml_tensor* out = NULL; + struct ggml_tensor* out = nullptr; int t0 = ggml_time_ms(); compute(8, x, false, &out, work_ctx); @@ -639,7 +639,7 @@ struct AutoEncoderKL : public VAE { auto z = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 4, 1); ggml_set_f32(z, 0.5f); print_ggml_tensor(z); - struct ggml_tensor* out = NULL; + struct ggml_tensor* out = nullptr; int t0 = ggml_time_ms(); compute(8, z, true, &out, work_ctx); diff --git a/wan.hpp b/wan.hpp index 31fa90b3..b6a08fde 100644 --- a/wan.hpp +++ b/wan.hpp @@ -2,6 +2,8 @@ #define __WAN_HPP__ #include +#include +#include #include "common.hpp" #include "flux.hpp" @@ -24,7 +26,7 @@ namespace WAN { std::tuple dilation; bool bias; - void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") { + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") override { params["weight"] = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, std::get<2>(kernel_size), @@ -46,17 +48,17 @@ namespace WAN { bool bias = true) : in_channels(in_channels), out_channels(out_channels), - kernel_size(kernel_size), - stride(stride), - padding(padding), - dilation(dilation), + kernel_size(std::move(kernel_size)), + stride(std::move(stride)), + padding(std::move(padding)), + dilation(std::move(dilation)), bias(bias) {} - struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* cache_x = NULL) { + struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* cache_x = nullptr) { // x: [N*IC, ID, IH, IW] // result: x: [N*OC, ID, IH, IW] struct ggml_tensor* w = params["weight"]; - struct ggml_tensor* b = NULL; + struct ggml_tensor* b = nullptr; if (bias) { b = params["bias"]; } @@ -68,7 +70,7 @@ namespace WAN { int lp2 = 2 * std::get<0>(padding); int rp2 = 0; - if (cache_x != NULL && lp2 > 0) { + if (cache_x != nullptr && lp2 > 0) { x = ggml_concat(ctx, cache_x, x, 2); lp2 -= (int)cache_x->ne[2]; } @@ -85,7 +87,7 @@ namespace WAN { protected: int64_t dim; - void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") { + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") override { ggml_type wtype = GGML_TYPE_F32; params["gamma"] = ggml_new_tensor_1d(ctx, wtype, dim); } @@ -94,7 +96,7 @@ namespace WAN { RMS_norm(int64_t dim) : dim(dim) {} - struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { + struct ggml_tensor* forward(struct ggml_context* ctx, struct 
ggml_tensor* x) override { // x: [N*IC, ID, IH, IW], IC == dim // assert N == 1 @@ -159,12 +161,12 @@ namespace WAN { int idx = feat_idx; feat_idx += 1; if (chunk_idx == 0) { - // feat_cache[idx] == NULL, pass + // feat_cache[idx] == nullptr, pass } else { auto time_conv = std::dynamic_pointer_cast(blocks["time_conv"]); auto cache_x = ggml_slice(ctx, x, 2, -CACHE_T, x->ne[2]); - if (cache_x->ne[2] < 2 && feat_cache[idx] != NULL) { // chunk_idx >= 2 + if (cache_x->ne[2] < 2 && feat_cache[idx] != nullptr) { // chunk_idx >= 2 // cache last frame of last two chunk cache_x = ggml_concat(ctx, ggml_slice(ctx, feat_cache[idx], 2, -1, feat_cache[idx]->ne[2]), @@ -209,7 +211,7 @@ namespace WAN { if (mode == "downsample3d") { if (feat_cache.size() > 0) { int idx = feat_idx; - if (feat_cache[idx] == NULL) { + if (feat_cache[idx] == nullptr) { feat_cache[idx] = x; feat_idx += 1; } else { @@ -373,7 +375,7 @@ namespace WAN { if (feat_cache.size() > 0) { int idx = feat_idx; auto cache_x = ggml_slice(ctx, x, 2, -CACHE_T, x->ne[2]); - if (cache_x->ne[2] < 2 && feat_cache[idx] != NULL) { + if (cache_x->ne[2] < 2 && feat_cache[idx] != nullptr) { // cache last frame of last two chunk cache_x = ggml_concat(ctx, ggml_slice(ctx, feat_cache[idx], 2, -1, feat_cache[idx]->ne[2]), @@ -566,7 +568,7 @@ namespace WAN { x = ggml_nn_attention(ctx, q, k, v, false); // [t, h * w, c] // v = ggml_cont(ctx, ggml_torch_permute(ctx, v, 1, 0, 2, 3)); // [t, h * w, c] - // x = ggml_nn_attention_ext(ctx, q, k, v, q->ne[2], NULL, false, false, true); + // x = ggml_nn_attention_ext(ctx, q, k, v, q->ne[2], nullptr, false, false, true); x = ggml_nn_cont(ctx, ggml_permute(ctx, x, 1, 0, 2, 3)); // [t, c, h * w] x = ggml_reshape_4d(ctx, x, w, h, c, n); // [t, c, h, w] @@ -672,7 +674,7 @@ namespace WAN { if (feat_cache.size() > 0) { int idx = feat_idx; auto cache_x = ggml_slice(ctx, x, 2, -CACHE_T, x->ne[2]); - if (cache_x->ne[2] < 2 && feat_cache[idx] != NULL) { + if (cache_x->ne[2] < 2 && feat_cache[idx] != nullptr) { // cache last frame of last two chunk cache_x = ggml_concat(ctx, ggml_slice(ctx, feat_cache[idx], 2, -1, feat_cache[idx]->ne[2]), @@ -724,7 +726,7 @@ namespace WAN { if (feat_cache.size() > 0) { int idx = feat_idx; auto cache_x = ggml_slice(ctx, x, 2, -CACHE_T, x->ne[2]); - if (cache_x->ne[2] < 2 && feat_cache[idx] != NULL) { + if (cache_x->ne[2] < 2 && feat_cache[idx] != nullptr) { // cache last frame of last two chunk cache_x = ggml_concat(ctx, ggml_slice(ctx, feat_cache[idx], 2, -1, feat_cache[idx]->ne[2]), @@ -843,7 +845,7 @@ namespace WAN { if (feat_cache.size() > 0) { int idx = feat_idx; auto cache_x = ggml_slice(ctx, x, 2, -CACHE_T, x->ne[2]); - if (cache_x->ne[2] < 2 && feat_cache[idx] != NULL) { + if (cache_x->ne[2] < 2 && feat_cache[idx] != nullptr) { // cache last frame of last two chunk cache_x = ggml_concat(ctx, ggml_slice(ctx, feat_cache[idx], 2, -1, feat_cache[idx]->ne[2]), @@ -895,7 +897,7 @@ namespace WAN { if (feat_cache.size() > 0) { int idx = feat_idx; auto cache_x = ggml_slice(ctx, x, 2, -CACHE_T, x->ne[2]); - if (cache_x->ne[2] < 2 && feat_cache[idx] != NULL) { + if (cache_x->ne[2] < 2 && feat_cache[idx] != nullptr) { // cache last frame of last two chunk cache_x = ggml_concat(ctx, ggml_slice(ctx, feat_cache[idx], 2, -1, feat_cache[idx]->ne[2]), @@ -935,9 +937,9 @@ namespace WAN { void clear_cache() { _conv_idx = 0; - _feat_map = std::vector(_conv_num, NULL); + _feat_map = std::vector(_conv_num, nullptr); _enc_conv_idx = 0; - _enc_feat_map = std::vector(_enc_conv_num, NULL); + _enc_feat_map = 
std::vector(_enc_conv_num, nullptr); } public: @@ -1116,11 +1118,11 @@ namespace WAN { ae.init(params_ctx, tensor_types, prefix); } - std::string get_desc() { + std::string get_desc() override { return "wan_vae"; } - void get_param_tensors(std::map& tensors, const std::string prefix) { + void get_param_tensors(std::map& tensors, const std::string prefix) override { ae.get_param_tensors(tensors, prefix); } @@ -1152,7 +1154,7 @@ namespace WAN { for (int64_t feat_idx = 0; feat_idx < ae._feat_map.size(); feat_idx++) { ggml_tensor* feat_cache = ae._feat_map[feat_idx]; - if (feat_cache != NULL) { + if (feat_cache != nullptr) { cache("feat_idx:" + std::to_string(feat_idx), feat_cache); ggml_build_forward_expand(gf, feat_cache); } @@ -1167,7 +1169,7 @@ namespace WAN { struct ggml_tensor* z, bool decode_graph, struct ggml_tensor** output, - struct ggml_context* output_ctx = NULL) { + struct ggml_context* output_ctx = nullptr) override { if (true) { auto get_graph = [&]() -> struct ggml_cgraph* { return build_graph(z, decode_graph); @@ -1180,7 +1182,7 @@ namespace WAN { auto get_graph = [&]() -> struct ggml_cgraph* { return build_graph_partial(z, decode_graph, i); }; - struct ggml_tensor* out = NULL; + struct ggml_tensor* out = nullptr; GGMLRunner::compute(get_graph, n_threads, true, &out, output_ctx); ae.clear_cache(); if (t == 1) { @@ -1220,11 +1222,11 @@ namespace WAN { void test() { struct ggml_init_params params; params.mem_size = static_cast(1024 * 1024) * 1024; // 1G - params.mem_buffer = NULL; + params.mem_buffer = nullptr; params.no_alloc = false; struct ggml_context* work_ctx = ggml_init(params); - GGML_ASSERT(work_ctx != NULL); + GGML_ASSERT(work_ctx != nullptr); if (true) { // cpu f32, pass @@ -1235,7 +1237,7 @@ namespace WAN { ggml_set_f32(z, 0.5f); z = load_tensor_from_file(work_ctx, "wan_vae_z.bin"); print_ggml_tensor(z); - struct ggml_tensor* out = NULL; + struct ggml_tensor* out = nullptr; int64_t t0 = ggml_time_ms(); compute(8, z, true, &out, work_ctx); @@ -1250,7 +1252,7 @@ namespace WAN { // ggml_backend_t backend = ggml_backend_cuda_init(0); ggml_backend_t backend = ggml_backend_cpu_init(); ggml_type model_data_type = GGML_TYPE_F16; - std::shared_ptr vae = std::shared_ptr(new WanVAERunner(backend, false, {}, "", false, VERSION_WAN2_2_TI2V)); + std::shared_ptr vae = std::make_shared(backend, false, String2GGMLType{}, "", false, VERSION_WAN2_2_TI2V); { LOG_INFO("loading from '%s'", file_path.c_str()); @@ -1309,7 +1311,7 @@ namespace WAN { ggml_backend_t backend, struct ggml_tensor* x, struct ggml_tensor* pe, - struct ggml_tensor* mask = NULL) { + struct ggml_tensor* mask = nullptr) { // x: [N, n_token, dim] // pe: [n_token, d_head/2, 2, 2] // return [N, n_token, dim] @@ -1367,7 +1369,7 @@ namespace WAN { ggml_backend_t backend, struct ggml_tensor* x, struct ggml_tensor* context, - int64_t context_img_len) { + int64_t context_img_len) override { // x: [N, n_token, dim] // context: [N, n_context, dim] // context_img_len: unused @@ -1388,7 +1390,7 @@ namespace WAN { k = norm_k->forward(ctx, k); auto v = v_proj->forward(ctx, context); // [N, n_context, dim] - x = ggml_nn_attention_ext(ctx, backend, q, k, v, num_heads, NULL, false, false, flash_attn); // [N, n_token, dim] + x = ggml_nn_attention_ext(ctx, backend, q, k, v, num_heads, nullptr, false, false, flash_attn); // [N, n_token, dim] x = o_proj->forward(ctx, x); // [N, n_token, dim] return x; @@ -1417,7 +1419,7 @@ namespace WAN { ggml_backend_t backend, struct ggml_tensor* x, struct ggml_tensor* context, - int64_t context_img_len) 
{ + int64_t context_img_len) override { // x: [N, n_token, dim] // context: [N, context_img_len + context_txt_len, dim] // return [N, n_token, dim] @@ -1455,8 +1457,8 @@ namespace WAN { k_img = norm_k_img->forward(ctx, k_img); auto v_img = v_img_proj->forward(ctx, context_img); // [N, context_img_len, dim] - auto img_x = ggml_nn_attention_ext(ctx, backend, q, k_img, v_img, num_heads, NULL, false, false, flash_attn); // [N, n_token, dim] - x = ggml_nn_attention_ext(ctx, backend, q, k, v, num_heads, NULL, false, false, flash_attn); // [N, n_token, dim] + auto img_x = ggml_nn_attention_ext(ctx, backend, q, k_img, v_img, num_heads, nullptr, false, false, flash_attn); // [N, n_token, dim] + x = ggml_nn_attention_ext(ctx, backend, q, k, v, num_heads, nullptr, false, false, flash_attn); // [N, n_token, dim] x = ggml_add(ctx, x, img_x); @@ -1497,7 +1499,7 @@ namespace WAN { protected: int dim; - void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") { + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") override { enum ggml_type wtype = get_type(prefix + "weight", tensor_types, GGML_TYPE_F32); params["modulation"] = ggml_new_tensor_3d(ctx, wtype, dim, 6, 1); } @@ -1587,7 +1589,7 @@ namespace WAN { class VaceWanAttentionBlock : public WanAttentionBlock { protected: int block_id; - void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") { + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") override { enum ggml_type wtype = get_type(prefix + "weight", tensor_types, GGML_TYPE_F32); params["modulation"] = ggml_new_tensor_3d(ctx, wtype, dim, 6, 1); } @@ -1641,7 +1643,7 @@ namespace WAN { protected: int dim; - void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") { + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") override { enum ggml_type wtype = get_type(prefix + "weight", tensor_types, GGML_TYPE_F32); params["modulation"] = ggml_new_tensor_3d(ctx, wtype, dim, 2, 1); } @@ -1688,7 +1690,7 @@ namespace WAN { int in_dim; int flf_pos_embed_token_number; - void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") { + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") override { if (flf_pos_embed_token_number > 0) { params["emb_pos"] = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, in_dim, flf_pos_embed_token_number, 1); } @@ -1876,8 +1878,8 @@ namespace WAN { struct ggml_tensor* timestep, struct ggml_tensor* context, struct ggml_tensor* pe, - struct ggml_tensor* clip_fea = NULL, - struct ggml_tensor* vace_context = NULL, + struct ggml_tensor* clip_fea = nullptr, + struct ggml_tensor* vace_context = nullptr, float vace_strength = 1.f, int64_t N = 1) { // x: [N*C, T, H, W], C => in_dim @@ -1920,7 +1922,7 @@ namespace WAN { context = text_embedding_2->forward(ctx, context); // [N, context_txt_len, dim] int64_t context_img_len = 0; - if (clip_fea != NULL) { + if (clip_fea != nullptr) { if (params.model_type == "i2v") { auto img_emb = std::dynamic_pointer_cast(blocks["img_emb"]); auto context_img = img_emb->forward(ctx, clip_fea); // [N, context_img_len, dim] @@ -1930,7 +1932,7 @@ namespace WAN { } // vace_patch_embedding - 
ggml_tensor* c = NULL; + ggml_tensor* c = nullptr; if (params.vace_layers > 0) { auto vace_patch_embedding = std::dynamic_pointer_cast(blocks["vace_patch_embedding"]); @@ -1971,9 +1973,9 @@ namespace WAN { struct ggml_tensor* timestep, struct ggml_tensor* context, struct ggml_tensor* pe, - struct ggml_tensor* clip_fea = NULL, - struct ggml_tensor* time_dim_concat = NULL, - struct ggml_tensor* vace_context = NULL, + struct ggml_tensor* clip_fea = nullptr, + struct ggml_tensor* time_dim_concat = nullptr, + struct ggml_tensor* vace_context = nullptr, float vace_strength = 1.f, int64_t N = 1) { // Forward pass of DiT. @@ -1997,7 +1999,7 @@ namespace WAN { int64_t h_len = ((H + (std::get<1>(params.patch_size) / 2)) / std::get<1>(params.patch_size)); int64_t w_len = ((W + (std::get<2>(params.patch_size) / 2)) / std::get<2>(params.patch_size)); - if (time_dim_concat != NULL) { + if (time_dim_concat != nullptr) { time_dim_concat = pad_to_patch_size(ctx, time_dim_concat); x = ggml_concat(ctx, x, time_dim_concat, 2); // [N*C, (T+pad_t) + (T2+pad_t2), H + pad_h, W + pad_w] t_len = ((x->ne[2] + (std::get<0>(params.patch_size) / 2)) / std::get<0>(params.patch_size)); @@ -2134,7 +2136,7 @@ namespace WAN { wan.init(params_ctx, tensor_types, prefix); } - std::string get_desc() { + std::string get_desc() override { return desc; } @@ -2145,10 +2147,10 @@ namespace WAN { struct ggml_cgraph* build_graph(struct ggml_tensor* x, struct ggml_tensor* timesteps, struct ggml_tensor* context, - struct ggml_tensor* clip_fea = NULL, - struct ggml_tensor* c_concat = NULL, - struct ggml_tensor* time_dim_concat = NULL, - struct ggml_tensor* vace_context = NULL, + struct ggml_tensor* clip_fea = nullptr, + struct ggml_tensor* c_concat = nullptr, + struct ggml_tensor* time_dim_concat = nullptr, + struct ggml_tensor* vace_context = nullptr, float vace_strength = 1.f) { struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, WAN_GRAPH_SIZE, false); @@ -2174,10 +2176,10 @@ namespace WAN { auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, wan_params.axes_dim_sum / 2, pos_len); // pe->data = pe_vec.data(); // print_ggml_tensor(pe); - // pe->data = NULL; + // pe->data = nullptr; set_backend_tensor_data(pe, pe_vec.data()); - if (c_concat != NULL) { + if (c_concat != nullptr) { x = ggml_concat(compute_ctx, x, c_concat, 3); } @@ -2201,13 +2203,13 @@ namespace WAN { struct ggml_tensor* x, struct ggml_tensor* timesteps, struct ggml_tensor* context, - struct ggml_tensor* clip_fea = NULL, - struct ggml_tensor* c_concat = NULL, - struct ggml_tensor* time_dim_concat = NULL, - struct ggml_tensor* vace_context = NULL, + struct ggml_tensor* clip_fea = nullptr, + struct ggml_tensor* c_concat = nullptr, + struct ggml_tensor* time_dim_concat = nullptr, + struct ggml_tensor* vace_context = nullptr, float vace_strength = 1.f, - struct ggml_tensor** output = NULL, - struct ggml_context* output_ctx = NULL) { + struct ggml_tensor** output = nullptr, + struct ggml_context* output_ctx = nullptr) { auto get_graph = [&]() -> struct ggml_cgraph* { return build_graph(x, timesteps, context, clip_fea, c_concat, time_dim_concat, vace_context, vace_strength); }; @@ -2218,11 +2220,11 @@ namespace WAN { void test() { struct ggml_init_params params; params.mem_size = static_cast(200 * 1024 * 1024); // 200 MB - params.mem_buffer = NULL; + params.mem_buffer = nullptr; params.no_alloc = false; struct ggml_context* work_ctx = ggml_init(params); - GGML_ASSERT(work_ctx != NULL); + GGML_ASSERT(work_ctx != nullptr); { // cpu f16: pass @@ -2244,10 
+2246,10 @@ namespace WAN { // auto clip_fea = load_tensor_from_file(work_ctx, "wan_dit_clip_fea.bin"); // print_ggml_tensor(clip_fea); - struct ggml_tensor* out = NULL; + struct ggml_tensor* out = nullptr; int t0 = ggml_time_ms(); - compute(8, x, timesteps, context, NULL, NULL, NULL, NULL, 1.f, &out, work_ctx); + compute(8, x, timesteps, context, nullptr, nullptr, nullptr, nullptr, 1.f, &out, work_ctx); int t1 = ggml_time_ms(); print_ggml_tensor(out); @@ -2275,12 +2277,12 @@ namespace WAN { } } - std::shared_ptr wan = std::shared_ptr(new WanRunner(backend, - false, - tensor_types, - "model.diffusion_model", - VERSION_WAN2_2_TI2V, - true)); + std::shared_ptr wan = std::make_shared(backend, + false, + tensor_types, + "model.diffusion_model", + VERSION_WAN2_2_TI2V, + true); wan->alloc_params_buffer(); std::map tensors;
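Note on the smart-pointer rewrite in the hunk above (and in the earlier T5Embedder and WanVAERunner hunks): constructing with std::make_shared<T>(...) instead of std::shared_ptr<T>(new T(...)) allocates the object and its control block in a single step and avoids spelling the type twice. A minimal self-contained sketch of the pattern follows; the Runner type and its constructor arguments are hypothetical and only illustrate the rewrite, they are not part of this patch:

#include <memory>
#include <string>
#include <utility>

struct Runner {
    int n_threads;
    std::string desc;
    Runner(int n_threads, std::string desc)
        : n_threads(n_threads), desc(std::move(desc)) {}
};

int main() {
    // old style: the type is named twice and two separate allocations are made
    // (one for the Runner object, one for the shared_ptr control block)
    std::shared_ptr<Runner> a{new Runner(8, "demo")};

    // rewritten style, as applied throughout this patch: one allocation, type named once
    auto b = std::make_shared<Runner>(8, "demo");

    return a->n_threads == b->n_threads ? 0 : 1;
}

Besides the single allocation, make_shared keeps the pointer exception-safe when the construction appears inside a larger expression, since no raw new result can leak between evaluation steps.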