diff --git a/.clang-tidy b/.clang-tidy new file mode 100644 index 00000000..63924a0b --- /dev/null +++ b/.clang-tidy @@ -0,0 +1,10 @@ +Checks: > + modernize-make-shared, + modernize-use-nullptr, + modernize-use-override, + modernize-pass-by-value, + modernize-return-braced-init-list, + modernize-deprecated-headers, +HeaderFilterRegex: '^$' +WarningsAsErrors: '' +FormatStyle: none \ No newline at end of file diff --git a/clip.hpp b/clip.hpp index 12d9f4f6..296ca9aa 100644 --- a/clip.hpp +++ b/clip.hpp @@ -550,7 +550,7 @@ class CLIPEmbeddings : public GGMLBlock { int64_t num_positions; bool force_clip_f32; - void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") { + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") override { enum ggml_type token_wtype = GGML_TYPE_F32; if (!force_clip_f32) { token_wtype = get_type(prefix + "token_embedding.weight", tensor_types, GGML_TYPE_F32); @@ -587,7 +587,7 @@ class CLIPEmbeddings : public GGMLBlock { GGML_ASSERT(input_ids->ne[0] == position_embed_weight->ne[1]); input_ids = ggml_reshape_3d(ctx, input_ids, input_ids->ne[0], 1, input_ids->ne[1]); - auto token_embedding = ggml_get_rows(ctx, custom_embed_weight != NULL ? custom_embed_weight : token_embed_weight, input_ids); + auto token_embedding = ggml_get_rows(ctx, custom_embed_weight != nullptr ? custom_embed_weight : token_embed_weight, input_ids); token_embedding = ggml_reshape_3d(ctx, token_embedding, token_embedding->ne[0], token_embedding->ne[1], token_embedding->ne[3]); // token_embedding + position_embedding @@ -606,7 +606,7 @@ class CLIPVisionEmbeddings : public GGMLBlock { int64_t image_size; int64_t num_patches; int64_t num_positions; - void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") { + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") override { enum ggml_type patch_wtype = GGML_TYPE_F16; enum ggml_type class_wtype = GGML_TYPE_F32; enum ggml_type position_wtype = GGML_TYPE_F32; @@ -641,10 +641,10 @@ class CLIPVisionEmbeddings : public GGMLBlock { // concat(patch_embedding, class_embedding) + position_embedding struct ggml_tensor* patch_embedding; int64_t N = pixel_values->ne[3]; - patch_embedding = ggml_nn_conv_2d(ctx, pixel_values, patch_embed_weight, NULL, patch_size, patch_size); // [N, embed_dim, image_size // pacht_size, image_size // pacht_size] - patch_embedding = ggml_reshape_3d(ctx, patch_embedding, num_patches, embed_dim, N); // [N, embed_dim, num_patches] - patch_embedding = ggml_cont(ctx, ggml_permute(ctx, patch_embedding, 1, 0, 2, 3)); // [N, num_patches, embed_dim] - patch_embedding = ggml_reshape_4d(ctx, patch_embedding, 1, embed_dim, num_patches, N); // [N, num_patches, embed_dim, 1] + patch_embedding = ggml_nn_conv_2d(ctx, pixel_values, patch_embed_weight, nullptr, patch_size, patch_size); // [N, embed_dim, image_size // pacht_size, image_size // pacht_size] + patch_embedding = ggml_reshape_3d(ctx, patch_embedding, num_patches, embed_dim, N); // [N, embed_dim, num_patches] + patch_embedding = ggml_cont(ctx, ggml_permute(ctx, patch_embedding, 1, 0, 2, 3)); // [N, num_patches, embed_dim] + patch_embedding = ggml_reshape_4d(ctx, patch_embedding, 1, embed_dim, num_patches, N); // [N, num_patches, embed_dim, 1] struct ggml_tensor* class_embedding = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, embed_dim, N); class_embedding = 
ggml_repeat(ctx, class_embed_weight, class_embedding); // [N, embed_dim] @@ -669,7 +669,7 @@ enum CLIPVersion { class CLIPTextModel : public GGMLBlock { protected: - void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") { + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") override { if (version == OPEN_CLIP_VIT_BIGG_14) { enum ggml_type wtype = GGML_TYPE_F32; params["text_projection"] = ggml_new_tensor_2d(ctx, wtype, projection_dim, hidden_size); @@ -735,8 +735,8 @@ class CLIPTextModel : public GGMLBlock { if (return_pooled) { auto text_projection = params["text_projection"]; ggml_tensor* pooled = ggml_view_1d(ctx, x, hidden_size, x->nb[1] * max_token_idx); - if (text_projection != NULL) { - pooled = ggml_nn_linear(ctx, pooled, text_projection, NULL); + if (text_projection != nullptr) { + pooled = ggml_nn_linear(ctx, pooled, text_projection, nullptr); } else { LOG_DEBUG("identity projection"); } @@ -814,7 +814,7 @@ class CLIPProjection : public UnaryBlock { int64_t out_features; bool transpose_weight; - void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") { + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") override { enum ggml_type wtype = get_type(prefix + "weight", tensor_types, GGML_TYPE_F32); if (transpose_weight) { params["weight"] = ggml_new_tensor_2d(ctx, wtype, out_features, in_features); @@ -831,12 +831,12 @@ class CLIPProjection : public UnaryBlock { out_features(out_features), transpose_weight(transpose_weight) {} - struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { + struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override { struct ggml_tensor* w = params["weight"]; if (transpose_weight) { w = ggml_cont(ctx, ggml_transpose(ctx, w)); } - return ggml_nn_linear(ctx, x, w, NULL); + return ggml_nn_linear(ctx, x, w, nullptr); } }; @@ -894,7 +894,7 @@ struct CLIPTextModelRunner : public GGMLRunner { model.init(params_ctx, tensor_types, prefix); } - std::string get_desc() { + std::string get_desc() override { return "clip"; } @@ -921,7 +921,7 @@ struct CLIPTextModelRunner : public GGMLRunner { struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids, int num_custom_embeddings = 0, - void* custom_embeddings_data = NULL, + void* custom_embeddings_data = nullptr, size_t max_token_idx = 0, bool return_pooled = false, int clip_skip = -1) { @@ -929,9 +929,9 @@ struct CLIPTextModelRunner : public GGMLRunner { input_ids = to_backend(input_ids); - struct ggml_tensor* embeddings = NULL; + struct ggml_tensor* embeddings = nullptr; - if (num_custom_embeddings > 0 && custom_embeddings_data != NULL) { + if (num_custom_embeddings > 0 && custom_embeddings_data != nullptr) { auto token_embed_weight = model.get_token_embed_weight(); auto custom_embeddings = ggml_new_tensor_2d(compute_ctx, token_embed_weight->type, @@ -958,7 +958,7 @@ struct CLIPTextModelRunner : public GGMLRunner { bool return_pooled, int clip_skip, ggml_tensor** output, - ggml_context* output_ctx = NULL) { + ggml_context* output_ctx = nullptr) { auto get_graph = [&]() -> struct ggml_cgraph* { return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled, clip_skip); }; diff --git a/common.hpp b/common.hpp index d3216714..7cc95d5b 100644 --- a/common.hpp +++ b/common.hpp @@ 
-121,7 +121,7 @@ class ResBlock : public GGMLBlock { } } - virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* emb = NULL) { + virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* emb = nullptr) { // For dims==3, we reduce dimension from 5d to 4d by merging h and w, in order not to change ggml // [N, c, t, h, w] => [N, c, t, h * w] // x: [N, channels, h, w] if dims == 2 else [N, channels, t, h, w] @@ -131,7 +131,7 @@ class ResBlock : public GGMLBlock { auto out_layers_0 = std::dynamic_pointer_cast(blocks["out_layers.0"]); auto out_layers_3 = std::dynamic_pointer_cast(blocks["out_layers.3"]); - if (emb == NULL) { + if (emb == nullptr) { GGML_ASSERT(skip_t_emb); } @@ -182,7 +182,7 @@ class GEGLU : public UnaryBlock { int64_t dim_in; int64_t dim_out; - void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") { + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") override { enum ggml_type wtype = get_type(prefix + "proj.weight", tensor_types, GGML_TYPE_F32); enum ggml_type bias_wtype = GGML_TYPE_F32; params["proj.weight"] = ggml_new_tensor_2d(ctx, wtype, dim_in, dim_out * 2); @@ -193,7 +193,7 @@ class GEGLU : public UnaryBlock { GEGLU(int64_t dim_in, int64_t dim_out) : dim_in(dim_in), dim_out(dim_out) {} - struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { + struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override { // x: [ne3, ne2, ne1, dim_in] // return: [ne3, ne2, ne1, dim_out] struct ggml_tensor* w = params["proj.weight"]; @@ -222,7 +222,7 @@ class GELU : public UnaryBlock { blocks["proj"] = std::shared_ptr(new Linear(dim_in, dim_out, bias)); } - struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { + struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override { // x: [ne3, ne2, ne1, dim_in] // return: [ne3, ne2, ne1, dim_out] auto proj = std::dynamic_pointer_cast(blocks["proj"]); @@ -325,7 +325,7 @@ class CrossAttention : public GGMLBlock { auto k = to_k->forward(ctx, context); // [N, n_context, inner_dim] auto v = to_v->forward(ctx, context); // [N, n_context, inner_dim] - x = ggml_nn_attention_ext(ctx, backend, q, k, v, n_head, NULL, false, false, flash_attn); // [N, n_token, inner_dim] + x = ggml_nn_attention_ext(ctx, backend, q, k, v, n_head, nullptr, false, false, flash_attn); // [N, n_token, inner_dim] x = to_out_0->forward(ctx, x); // [N, n_token, query_dim] return x; @@ -483,7 +483,7 @@ class SpatialTransformer : public GGMLBlock { class AlphaBlender : public GGMLBlock { protected: - void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") { + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") override { // Get the type of the "mix_factor" tensor from the input tensors map with the specified prefix enum ggml_type wtype = GGML_TYPE_F32; params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1); diff --git a/conditioner.hpp b/conditioner.hpp index 4f9efb8c..e299d367 100644 --- a/conditioner.hpp +++ b/conditioner.hpp @@ -6,9 +6,9 @@ #include "t5.hpp" struct SDCondition { - struct ggml_tensor* c_crossattn = NULL; // aka context - struct ggml_tensor* c_vector = NULL; // aka y - struct ggml_tensor* c_concat = NULL; + struct ggml_tensor* c_crossattn = nullptr; 
// aka context + struct ggml_tensor* c_vector = nullptr; // aka y + struct ggml_tensor* c_concat = nullptr; SDCondition() = default; SDCondition(struct ggml_tensor* c_crossattn, struct ggml_tensor* c_vector, struct ggml_tensor* c_concat) @@ -79,28 +79,28 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { } } - void get_param_tensors(std::map& tensors) { + void get_param_tensors(std::map& tensors) override { text_model->get_param_tensors(tensors, "cond_stage_model.transformer.text_model"); if (sd_version_is_sdxl(version)) { text_model2->get_param_tensors(tensors, "cond_stage_model.1.transformer.text_model"); } } - void alloc_params_buffer() { + void alloc_params_buffer() override { text_model->alloc_params_buffer(); if (sd_version_is_sdxl(version)) { text_model2->alloc_params_buffer(); } } - void free_params_buffer() { + void free_params_buffer() override { text_model->free_params_buffer(); if (sd_version_is_sdxl(version)) { text_model2->free_params_buffer(); } } - size_t get_params_buffer_size() { + size_t get_params_buffer_size() override { size_t buffer_size = text_model->get_params_buffer_size(); if (sd_version_is_sdxl(version)) { buffer_size += text_model2->get_params_buffer_size(); @@ -121,11 +121,11 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { } struct ggml_init_params params; params.mem_size = 100 * 1024 * 1024; // max for custom embeddings 100 MB - params.mem_buffer = NULL; + params.mem_buffer = nullptr; params.no_alloc = false; struct ggml_context* embd_ctx = ggml_init(params); - struct ggml_tensor* embd = NULL; - struct ggml_tensor* embd2 = NULL; + struct ggml_tensor* embd = nullptr; + struct ggml_tensor* embd2 = nullptr; auto on_load = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) { if (tensor_storage.ne[0] != text_model->model.hidden_size) { if (text_model2) { @@ -404,11 +404,11 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { int adm_in_channels = -1, bool zero_out_masked = false) { int64_t t0 = ggml_time_ms(); - struct ggml_tensor* hidden_states = NULL; // [N, n_token, hidden_size] - struct ggml_tensor* chunk_hidden_states = NULL; // [n_token, hidden_size] or [n_token, hidden_size + hidden_size2] - struct ggml_tensor* chunk_hidden_states1 = NULL; // [n_token, hidden_size] - struct ggml_tensor* chunk_hidden_states2 = NULL; // [n_token, hidden_size2] - struct ggml_tensor* pooled = NULL; + struct ggml_tensor* hidden_states = nullptr; // [N, n_token, hidden_size] + struct ggml_tensor* chunk_hidden_states = nullptr; // [n_token, hidden_size] or [n_token, hidden_size + hidden_size2] + struct ggml_tensor* chunk_hidden_states1 = nullptr; // [n_token, hidden_size] + struct ggml_tensor* chunk_hidden_states2 = nullptr; // [n_token, hidden_size2] + struct ggml_tensor* pooled = nullptr; std::vector hidden_states_vec; if (clip_skip <= 0) { @@ -424,7 +424,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { weights.begin() + (chunk_idx + 1) * chunk_len); auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens); - struct ggml_tensor* input_ids2 = NULL; + struct ggml_tensor* input_ids2 = nullptr; size_t max_token_idx = 0; if (sd_version_is_sdxl(version)) { auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), tokenizer.EOS_TOKEN_ID); @@ -512,7 +512,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { chunk_hidden_states->ne[0], ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]); - ggml_tensor* vec = NULL; + ggml_tensor* vec = nullptr; if 
(sd_version_is_sdxl(version)) { int out_dim = 256; vec = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, adm_in_channels); @@ -549,13 +549,13 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { GGML_ASSERT(offset == ggml_nbytes(vec)); } // print_ggml_tensor(result); - return SDCondition(hidden_states, vec, NULL); + return {hidden_states, vec, nullptr}; } std::tuple> get_learned_condition_with_trigger(ggml_context* work_ctx, int n_threads, - const ConditionerParams& conditioner_params) { + const ConditionerParams& conditioner_params) override { auto image_tokens = convert_token_to_id(trigger_word); // if(image_tokens.size() == 1){ // printf(" image token id is: %d \n", image_tokens[0]); @@ -589,7 +589,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { } std::string remove_trigger_from_prompt(ggml_context* work_ctx, - const std::string& prompt) { + const std::string& prompt) override { auto image_tokens = convert_token_to_id(trigger_word); GGML_ASSERT(image_tokens.size() == 1); auto tokens_and_weights = tokenize(prompt, false); @@ -602,7 +602,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { SDCondition get_learned_condition(ggml_context* work_ctx, int n_threads, - const ConditionerParams& conditioner_params) { + const ConditionerParams& conditioner_params) override { auto tokens_and_weights = tokenize(conditioner_params.text, true); std::vector& tokens = tokens_and_weights.first; std::vector& weights = tokens_and_weights.second; @@ -628,7 +628,7 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner { vision_model.init(params_ctx, tensor_types, "cond_stage_model.transformer"); } - std::string get_desc() { + std::string get_desc() override { return "clip_vision"; } @@ -678,25 +678,25 @@ struct SD3CLIPEmbedder : public Conditioner { t5 = std::make_shared(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer"); } - void get_param_tensors(std::map& tensors) { + void get_param_tensors(std::map& tensors) override { clip_l->get_param_tensors(tensors, "text_encoders.clip_l.transformer.text_model"); clip_g->get_param_tensors(tensors, "text_encoders.clip_g.transformer.text_model"); t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer"); } - void alloc_params_buffer() { + void alloc_params_buffer() override { clip_l->alloc_params_buffer(); clip_g->alloc_params_buffer(); t5->alloc_params_buffer(); } - void free_params_buffer() { + void free_params_buffer() override { clip_l->free_params_buffer(); clip_g->free_params_buffer(); t5->free_params_buffer(); } - size_t get_params_buffer_size() { + size_t get_params_buffer_size() override { size_t buffer_size = clip_l->get_params_buffer_size(); buffer_size += clip_g->get_params_buffer_size(); buffer_size += t5->get_params_buffer_size(); @@ -747,7 +747,7 @@ struct SD3CLIPEmbedder : public Conditioner { clip_l_tokenizer.pad_tokens(clip_l_tokens, clip_l_weights, max_length, padding); clip_g_tokenizer.pad_tokens(clip_g_tokens, clip_g_weights, max_length, padding); - t5_tokenizer.pad_tokens(t5_tokens, t5_weights, NULL, max_length, padding); + t5_tokenizer.pad_tokens(t5_tokens, t5_weights, nullptr, max_length, padding); // for (int i = 0; i < clip_l_tokens.size(); i++) { // std::cout << clip_l_tokens[i] << ":" << clip_l_weights[i] << ", "; @@ -784,14 +784,14 @@ struct SD3CLIPEmbedder : public Conditioner { } int64_t t0 = ggml_time_ms(); - struct ggml_tensor* hidden_states = NULL; // [N, n_token*2, 4096] - struct ggml_tensor* chunk_hidden_states = NULL; // [n_token*2, 4096] - struct 
ggml_tensor* chunk_hidden_states_l = NULL; // [n_token, hidden_size_l] - struct ggml_tensor* chunk_hidden_states_g = NULL; // [n_token, hidden_size_g] - struct ggml_tensor* chunk_hidden_states_t5 = NULL; // [n_token, hidden_size_t5] - struct ggml_tensor* pooled = NULL; - struct ggml_tensor* pooled_l = NULL; // [768,] - struct ggml_tensor* pooled_g = NULL; // [1280,] + struct ggml_tensor* hidden_states = nullptr; // [N, n_token*2, 4096] + struct ggml_tensor* chunk_hidden_states = nullptr; // [n_token*2, 4096] + struct ggml_tensor* chunk_hidden_states_l = nullptr; // [n_token, hidden_size_l] + struct ggml_tensor* chunk_hidden_states_g = nullptr; // [n_token, hidden_size_g] + struct ggml_tensor* chunk_hidden_states_t5 = nullptr; // [n_token, hidden_size_t5] + struct ggml_tensor* pooled = nullptr; + struct ggml_tensor* pooled_l = nullptr; // [768,] + struct ggml_tensor* pooled_g = nullptr; // [1280,] std::vector hidden_states_vec; size_t chunk_len = 77; @@ -810,7 +810,7 @@ struct SD3CLIPEmbedder : public Conditioner { clip_l->compute(n_threads, input_ids, 0, - NULL, + nullptr, max_token_idx, false, clip_skip, @@ -838,7 +838,7 @@ struct SD3CLIPEmbedder : public Conditioner { clip_l->compute(n_threads, input_ids, 0, - NULL, + nullptr, max_token_idx, true, clip_skip, @@ -860,7 +860,7 @@ struct SD3CLIPEmbedder : public Conditioner { clip_g->compute(n_threads, input_ids, 0, - NULL, + nullptr, max_token_idx, false, clip_skip, @@ -889,7 +889,7 @@ struct SD3CLIPEmbedder : public Conditioner { clip_g->compute(n_threads, input_ids, 0, - NULL, + nullptr, max_token_idx, true, clip_skip, @@ -909,7 +909,7 @@ struct SD3CLIPEmbedder : public Conditioner { t5->compute(n_threads, input_ids, - NULL, + nullptr, &chunk_hidden_states_t5, work_ctx); { @@ -974,12 +974,12 @@ struct SD3CLIPEmbedder : public Conditioner { hidden_states, chunk_hidden_states->ne[0], ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]); - return SDCondition(hidden_states, pooled, NULL); + return {hidden_states, pooled, nullptr}; } SDCondition get_learned_condition(ggml_context* work_ctx, int n_threads, - const ConditionerParams& conditioner_params) { + const ConditionerParams& conditioner_params) override { auto tokens_and_weights = tokenize(conditioner_params.text, 77, true); return get_learned_condition_common(work_ctx, n_threads, @@ -1003,22 +1003,22 @@ struct FluxCLIPEmbedder : public Conditioner { t5 = std::make_shared(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer"); } - void get_param_tensors(std::map& tensors) { + void get_param_tensors(std::map& tensors) override { clip_l->get_param_tensors(tensors, "text_encoders.clip_l.transformer.text_model"); t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer"); } - void alloc_params_buffer() { + void alloc_params_buffer() override { clip_l->alloc_params_buffer(); t5->alloc_params_buffer(); } - void free_params_buffer() { + void free_params_buffer() override { clip_l->free_params_buffer(); t5->free_params_buffer(); } - size_t get_params_buffer_size() { + size_t get_params_buffer_size() override { size_t buffer_size = clip_l->get_params_buffer_size(); buffer_size += t5->get_params_buffer_size(); return buffer_size; @@ -1061,7 +1061,7 @@ struct FluxCLIPEmbedder : public Conditioner { } clip_l_tokenizer.pad_tokens(clip_l_tokens, clip_l_weights, 77, padding); - t5_tokenizer.pad_tokens(t5_tokens, t5_weights, NULL, max_length, padding); + t5_tokenizer.pad_tokens(t5_tokens, t5_weights, nullptr, max_length, padding); // for (int i = 0; i < 
clip_l_tokens.size(); i++) { // std::cout << clip_l_tokens[i] << ":" << clip_l_weights[i] << ", "; @@ -1091,9 +1091,9 @@ struct FluxCLIPEmbedder : public Conditioner { } int64_t t0 = ggml_time_ms(); - struct ggml_tensor* hidden_states = NULL; // [N, n_token, 4096] - struct ggml_tensor* chunk_hidden_states = NULL; // [n_token, 4096] - struct ggml_tensor* pooled = NULL; // [768,] + struct ggml_tensor* hidden_states = nullptr; // [N, n_token, 4096] + struct ggml_tensor* chunk_hidden_states = nullptr; // [n_token, 4096] + struct ggml_tensor* pooled = nullptr; // [768,] std::vector hidden_states_vec; size_t chunk_count = t5_tokens.size() / chunk_len; @@ -1115,7 +1115,7 @@ struct FluxCLIPEmbedder : public Conditioner { clip_l->compute(n_threads, input_ids, 0, - NULL, + nullptr, max_token_idx, true, clip_skip, @@ -1134,7 +1134,7 @@ struct FluxCLIPEmbedder : public Conditioner { t5->compute(n_threads, input_ids, - NULL, + nullptr, &chunk_hidden_states, work_ctx); { @@ -1173,12 +1173,12 @@ struct FluxCLIPEmbedder : public Conditioner { hidden_states, chunk_hidden_states->ne[0], ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]); - return SDCondition(hidden_states, pooled, NULL); + return {hidden_states, pooled, nullptr}; } SDCondition get_learned_condition(ggml_context* work_ctx, int n_threads, - const ConditionerParams& conditioner_params) { + const ConditionerParams& conditioner_params) override { auto tokens_and_weights = tokenize(conditioner_params.text, chunk_len, true); return get_learned_condition_common(work_ctx, n_threads, @@ -1206,19 +1206,19 @@ struct T5CLIPEmbedder : public Conditioner { t5 = std::make_shared(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer", is_umt5); } - void get_param_tensors(std::map& tensors) { + void get_param_tensors(std::map& tensors) override { t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer"); } - void alloc_params_buffer() { + void alloc_params_buffer() override { t5->alloc_params_buffer(); } - void free_params_buffer() { + void free_params_buffer() override { t5->free_params_buffer(); } - size_t get_params_buffer_size() { + size_t get_params_buffer_size() override { size_t buffer_size = 0; buffer_size += t5->get_params_buffer_size(); @@ -1287,9 +1287,9 @@ struct T5CLIPEmbedder : public Conditioner { auto& t5_attn_mask_vec = std::get<2>(token_and_weights); int64_t t0 = ggml_time_ms(); - struct ggml_tensor* hidden_states = NULL; // [N, n_token, 4096] - struct ggml_tensor* chunk_hidden_states = NULL; // [n_token, 4096] - struct ggml_tensor* pooled = NULL; + struct ggml_tensor* hidden_states = nullptr; // [N, n_token, 4096] + struct ggml_tensor* chunk_hidden_states = nullptr; // [n_token, 4096] + struct ggml_tensor* pooled = nullptr; struct ggml_tensor* t5_attn_mask = vector_to_ggml_tensor(work_ctx, t5_attn_mask_vec); // [n_token] std::vector hidden_states_vec; @@ -1306,7 +1306,7 @@ struct T5CLIPEmbedder : public Conditioner { t5_attn_mask_vec.begin() + (chunk_idx + 1) * chunk_len); auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens); - auto t5_attn_mask_chunk = use_mask ? vector_to_ggml_tensor(work_ctx, chunk_mask) : NULL; + auto t5_attn_mask_chunk = use_mask ? 
vector_to_ggml_tensor(work_ctx, chunk_mask) : nullptr; t5->compute(n_threads, input_ids, @@ -1358,12 +1358,12 @@ struct T5CLIPEmbedder : public Conditioner { modify_mask_to_attend_padding(t5_attn_mask, ggml_nelements(t5_attn_mask), mask_pad); - return SDCondition(hidden_states, t5_attn_mask, NULL); + return {hidden_states, t5_attn_mask, nullptr}; } SDCondition get_learned_condition(ggml_context* work_ctx, int n_threads, - const ConditionerParams& conditioner_params) { + const ConditionerParams& conditioner_params) override { auto tokens_and_weights = tokenize(conditioner_params.text, chunk_len, true); return get_learned_condition_common(work_ctx, n_threads, @@ -1389,19 +1389,19 @@ struct Qwen2_5_VLCLIPEmbedder : public Conditioner { enable_vision); } - void get_param_tensors(std::map& tensors) { + void get_param_tensors(std::map& tensors) override { qwenvl->get_param_tensors(tensors, "text_encoders.qwen2vl"); } - void alloc_params_buffer() { + void alloc_params_buffer() override { qwenvl->alloc_params_buffer(); } - void free_params_buffer() { + void free_params_buffer() override { qwenvl->free_params_buffer(); } - size_t get_params_buffer_size() { + size_t get_params_buffer_size() override { size_t buffer_size = 0; buffer_size += qwenvl->get_params_buffer_size(); return buffer_size; @@ -1454,7 +1454,7 @@ struct Qwen2_5_VLCLIPEmbedder : public Conditioner { SDCondition get_learned_condition(ggml_context* work_ctx, int n_threads, - const ConditionerParams& conditioner_params) { + const ConditionerParams& conditioner_params) override { std::string prompt; std::vector> image_embeds; size_t system_prompt_length = 0; @@ -1530,7 +1530,7 @@ struct Qwen2_5_VLCLIPEmbedder : public Conditioner { auto& weights = std::get<1>(tokens_and_weights); int64_t t0 = ggml_time_ms(); - struct ggml_tensor* hidden_states = NULL; // [N, n_token, 3584] + struct ggml_tensor* hidden_states = nullptr; // [N, n_token, 3584] auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens); @@ -1570,7 +1570,7 @@ struct Qwen2_5_VLCLIPEmbedder : public Conditioner { int64_t t1 = ggml_time_ms(); LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0); - return SDCondition(new_hidden_states, nullptr, nullptr); + return {new_hidden_states, nullptr, nullptr}; } }; diff --git a/control.hpp b/control.hpp index 79b82a22..1f231f93 100644 --- a/control.hpp +++ b/control.hpp @@ -206,18 +206,18 @@ class ControlNetBlock : public GGMLBlock { struct ggml_tensor* guided_hint, struct ggml_tensor* timesteps, struct ggml_tensor* context, - struct ggml_tensor* y = NULL) { + struct ggml_tensor* y = nullptr) { // x: [N, in_channels, h, w] or [N, in_channels/2, h, w] // timesteps: [N,] // context: [N, max_position, hidden_size] or [1, max_position, hidden_size]. 
for example, [N, 77, 768] // y: [N, adm_in_channels] or [1, adm_in_channels] - if (context != NULL) { + if (context != nullptr) { if (context->ne[2] != x->ne[3]) { context = ggml_repeat(ctx, context, ggml_new_tensor_3d(ctx, GGML_TYPE_F32, context->ne[0], context->ne[1], x->ne[3])); } } - if (y != NULL) { + if (y != nullptr) { if (y->ne[1] != x->ne[3]) { y = ggml_repeat(ctx, y, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, y->ne[0], x->ne[3])); } @@ -237,7 +237,7 @@ class ControlNetBlock : public GGMLBlock { emb = time_embed_2->forward(ctx, emb); // [N, time_embed_dim] // SDXL/SVD - if (y != NULL) { + if (y != nullptr) { auto label_embed_0 = std::dynamic_pointer_cast(blocks["label_emb.0.0"]); auto label_embed_2 = std::dynamic_pointer_cast(blocks["label_emb.0.2"]); @@ -250,7 +250,7 @@ class ControlNetBlock : public GGMLBlock { std::vector outs; - if (guided_hint == NULL) { + if (guided_hint == nullptr) { guided_hint = input_hint_block_forward(ctx, hint, emb, context); } outs.push_back(guided_hint); @@ -312,10 +312,10 @@ struct ControlNet : public GGMLRunner { SDVersion version = VERSION_SD1; ControlNetBlock control_net; - ggml_backend_buffer_t control_buffer = NULL; // keep control output tensors in backend memory - ggml_context* control_ctx = NULL; + ggml_backend_buffer_t control_buffer = nullptr; // keep control output tensors in backend memory + ggml_context* control_ctx = nullptr; std::vector controls; // (12 input block outputs, 1 middle block output) SD 1.5 - struct ggml_tensor* guided_hint = NULL; // guided_hint cache, for faster inference + struct ggml_tensor* guided_hint = nullptr; // guided_hint cache, for faster inference bool guided_hint_cached = false; ControlNet(ggml_backend_t backend, @@ -337,14 +337,14 @@ struct ControlNet : public GGMLRunner { } } - ~ControlNet() { + ~ControlNet() override { free_control_ctx(); } void alloc_control_ctx(std::vector outs) { struct ggml_init_params params; params.mem_size = static_cast(outs.size() * ggml_tensor_overhead()) + 1024 * 1024; - params.mem_buffer = NULL; + params.mem_buffer = nullptr; params.no_alloc = true; control_ctx = ggml_init(params); @@ -366,20 +366,20 @@ struct ControlNet : public GGMLRunner { } void free_control_ctx() { - if (control_buffer != NULL) { + if (control_buffer != nullptr) { ggml_backend_buffer_free(control_buffer); - control_buffer = NULL; + control_buffer = nullptr; } - if (control_ctx != NULL) { + if (control_ctx != nullptr) { ggml_free(control_ctx); - control_ctx = NULL; + control_ctx = nullptr; } - guided_hint = NULL; + guided_hint = nullptr; guided_hint_cached = false; controls.clear(); } - std::string get_desc() { + std::string get_desc() override { return "control_net"; } @@ -391,12 +391,12 @@ struct ControlNet : public GGMLRunner { struct ggml_tensor* hint, struct ggml_tensor* timesteps, struct ggml_tensor* context, - struct ggml_tensor* y = NULL) { + struct ggml_tensor* y = nullptr) { struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, CONTROL_NET_GRAPH_SIZE, false); x = to_backend(x); if (guided_hint_cached) { - hint = NULL; + hint = nullptr; } else { hint = to_backend(hint); } @@ -408,12 +408,12 @@ struct ControlNet : public GGMLRunner { runtime_backend, x, hint, - guided_hint_cached ? guided_hint : NULL, + guided_hint_cached ? 
guided_hint : nullptr, timesteps, context, y); - if (control_ctx == NULL) { + if (control_ctx == nullptr) { alloc_control_ctx(outs); } @@ -431,8 +431,8 @@ struct ControlNet : public GGMLRunner { struct ggml_tensor* timesteps, struct ggml_tensor* context, struct ggml_tensor* y, - struct ggml_tensor** output = NULL, - struct ggml_context* output_ctx = NULL) { + struct ggml_tensor** output = nullptr, + struct ggml_context* output_ctx = nullptr) { // x: [N, in_channels, h, w] // timesteps: [N, ] // context: [N, max_position, hidden_size]([N, 77, 768]) or [1, max_position, hidden_size] diff --git a/denoiser.hpp b/denoiser.hpp index 3c53301b..cb2010ca 100644 --- a/denoiser.hpp +++ b/denoiser.hpp @@ -19,7 +19,7 @@ struct SigmaSchedule { }; struct DiscreteSchedule : SigmaSchedule { - std::vector get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) { + std::vector get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override { std::vector result; int t_max = TIMESTEPS - 1; @@ -43,7 +43,7 @@ struct DiscreteSchedule : SigmaSchedule { }; struct ExponentialSchedule : SigmaSchedule { - std::vector get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) { + std::vector get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override { std::vector sigmas; // Calculate step size @@ -150,7 +150,7 @@ std::vector log_linear_interpolation(std::vector sigma_in, https://research.nvidia.com/labs/toronto-ai/AlignYourSteps/howto.html */ struct AYSSchedule : SigmaSchedule { - std::vector get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) { + std::vector get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override { const std::vector noise_levels[] = { /* SD1.5 */ {14.6146412293f, 6.4745760956f, 3.8636745985f, 2.6946151520f, @@ -204,7 +204,7 @@ struct AYSSchedule : SigmaSchedule { * GITS Scheduler: https://github.com/zju-pi/diff-sampler/tree/main/gits-main */ struct GITSSchedule : SigmaSchedule { - std::vector get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) { + std::vector get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override { if (sigma_max <= 0.0f) { return std::vector{}; } @@ -252,7 +252,7 @@ struct SGMUniformSchedule : SigmaSchedule { }; struct KarrasSchedule : SigmaSchedule { - std::vector get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) { + std::vector get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override { // These *COULD* be function arguments here, // but does anybody ever bother to touch them? 
float rho = 7.f; @@ -350,15 +350,15 @@ struct CompVisDenoiser : public Denoiser { float sigma_data = 1.0f; - float sigma_min() { + float sigma_min() override { return sigmas[0]; } - float sigma_max() { + float sigma_max() override { return sigmas[TIMESTEPS - 1]; } - float sigma_to_t(float sigma) { + float sigma_to_t(float sigma) override { float log_sigma = std::log(sigma); std::vector dists; dists.reserve(TIMESTEPS); @@ -384,7 +384,7 @@ struct CompVisDenoiser : public Denoiser { return t; } - float t_to_sigma(float t) { + float t_to_sigma(float t) override { int low_idx = static_cast(std::floor(t)); int high_idx = static_cast(std::ceil(t)); float w = t - static_cast(low_idx); @@ -392,7 +392,7 @@ struct CompVisDenoiser : public Denoiser { return std::exp(log_sigma); } - std::vector get_scalings(float sigma) { + std::vector get_scalings(float sigma) override { float c_skip = 1.0f; float c_out = -sigma; float c_in = 1.0f / std::sqrt(sigma * sigma + sigma_data * sigma_data); @@ -400,19 +400,19 @@ struct CompVisDenoiser : public Denoiser { } // this function will modify noise/latent - ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) { + ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) override { ggml_tensor_scale(noise, sigma); ggml_tensor_add(latent, noise); return latent; } - ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent) { + ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent) override { return latent; } }; struct CompVisVDenoiser : public CompVisDenoiser { - std::vector get_scalings(float sigma) { + std::vector get_scalings(float sigma) override { float c_skip = sigma_data * sigma_data / (sigma * sigma + sigma_data * sigma_data); float c_out = -sigma * sigma_data / std::sqrt(sigma * sigma + sigma_data * sigma_data); float c_in = 1.0f / std::sqrt(sigma * sigma + sigma_data * sigma_data); @@ -429,19 +429,19 @@ struct EDMVDenoiser : public CompVisVDenoiser { scheduler = std::make_shared(); } - float t_to_sigma(float t) { + float t_to_sigma(float t) override { return std::exp(t * 4 / (float)TIMESTEPS); } - float sigma_to_t(float s) { + float sigma_to_t(float s) override { return 0.25 * std::log(s); } - float sigma_min() { + float sigma_min() override { return min_sigma; } - float sigma_max() { + float sigma_max() override { return max_sigma; } }; @@ -470,24 +470,24 @@ struct DiscreteFlowDenoiser : public Denoiser { } } - float sigma_min() { + float sigma_min() override { return sigmas[0]; } - float sigma_max() { + float sigma_max() override { return sigmas[TIMESTEPS - 1]; } - float sigma_to_t(float sigma) { + float sigma_to_t(float sigma) override { return sigma * 1000.f; } - float t_to_sigma(float t) { + float t_to_sigma(float t) override { t = t + 1; return time_snr_shift(shift, t / 1000.f); } - std::vector get_scalings(float sigma) { + std::vector get_scalings(float sigma) override { float c_skip = 1.0f; float c_out = -sigma; float c_in = 1.0f; @@ -495,14 +495,14 @@ struct DiscreteFlowDenoiser : public Denoiser { } // this function will modify noise/latent - ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) { + ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) override { ggml_tensor_scale(noise, sigma); ggml_tensor_scale(latent, 1.0f - sigma); ggml_tensor_add(latent, noise); return latent; } - ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent) { + ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* 
latent) override { ggml_tensor_scale(latent, 1.0f / (1.0f - sigma)); return latent; } @@ -529,24 +529,24 @@ struct FluxFlowDenoiser : public Denoiser { } } - float sigma_min() { + float sigma_min() override { return sigmas[0]; } - float sigma_max() { + float sigma_max() override { return sigmas[TIMESTEPS - 1]; } - float sigma_to_t(float sigma) { + float sigma_to_t(float sigma) override { return sigma; } - float t_to_sigma(float t) { + float t_to_sigma(float t) override { t = t + 1; return flux_time_shift(shift, 1.0f, t / TIMESTEPS); } - std::vector get_scalings(float sigma) { + std::vector get_scalings(float sigma) override { float c_skip = 1.0f; float c_out = -sigma; float c_in = 1.0f; @@ -554,14 +554,14 @@ struct FluxFlowDenoiser : public Denoiser { } // this function will modify noise/latent - ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) { + ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) override { ggml_tensor_scale(noise, sigma); ggml_tensor_scale(latent, 1.0f - sigma); ggml_tensor_add(latent, noise); return latent; } - ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent) { + ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent) override { ggml_tensor_scale(latent, 1.0f / (1.0f - sigma)); return latent; } diff --git a/diffusion_model.hpp b/diffusion_model.hpp index 6c38b58a..94b29bf1 100644 --- a/diffusion_model.hpp +++ b/diffusion_model.hpp @@ -8,18 +8,18 @@ #include "wan.hpp" struct DiffusionParams { - struct ggml_tensor* x = NULL; - struct ggml_tensor* timesteps = NULL; - struct ggml_tensor* context = NULL; - struct ggml_tensor* c_concat = NULL; - struct ggml_tensor* y = NULL; - struct ggml_tensor* guidance = NULL; + struct ggml_tensor* x = nullptr; + struct ggml_tensor* timesteps = nullptr; + struct ggml_tensor* context = nullptr; + struct ggml_tensor* c_concat = nullptr; + struct ggml_tensor* y = nullptr; + struct ggml_tensor* guidance = nullptr; std::vector ref_latents = {}; bool increase_ref_index = false; int num_video_frames = -1; std::vector controls = {}; float control_strength = 0.f; - struct ggml_tensor* vace_context = NULL; + struct ggml_tensor* vace_context = nullptr; float vace_strength = 1.f; std::vector skip_layers = {}; }; @@ -28,8 +28,8 @@ struct DiffusionModel { virtual std::string get_desc() = 0; virtual void compute(int n_threads, DiffusionParams diffusion_params, - struct ggml_tensor** output = NULL, - struct ggml_context* output_ctx = NULL) = 0; + struct ggml_tensor** output = nullptr, + struct ggml_context* output_ctx = nullptr) = 0; virtual void alloc_params_buffer() = 0; virtual void free_params_buffer() = 0; virtual void free_compute_buffer() = 0; @@ -49,38 +49,38 @@ struct UNetModel : public DiffusionModel { : unet(backend, offload_params_to_cpu, tensor_types, "model.diffusion_model", version, flash_attn) { } - std::string get_desc() { + std::string get_desc() override { return unet.get_desc(); } - void alloc_params_buffer() { + void alloc_params_buffer() override { unet.alloc_params_buffer(); } - void free_params_buffer() { + void free_params_buffer() override { unet.free_params_buffer(); } - void free_compute_buffer() { + void free_compute_buffer() override { unet.free_compute_buffer(); } - void get_param_tensors(std::map& tensors) { + void get_param_tensors(std::map& tensors) override { unet.get_param_tensors(tensors, "model.diffusion_model"); } - size_t get_params_buffer_size() { + size_t get_params_buffer_size() override { return 
unet.get_params_buffer_size(); } - int64_t get_adm_in_channels() { + int64_t get_adm_in_channels() override { return unet.unet.adm_in_channels; } void compute(int n_threads, DiffusionParams diffusion_params, - struct ggml_tensor** output = NULL, - struct ggml_context* output_ctx = NULL) { + struct ggml_tensor** output = nullptr, + struct ggml_context* output_ctx = nullptr) override { return unet.compute(n_threads, diffusion_params.x, diffusion_params.timesteps, @@ -103,38 +103,38 @@ struct MMDiTModel : public DiffusionModel { : mmdit(backend, offload_params_to_cpu, flash_attn, tensor_types, "model.diffusion_model") { } - std::string get_desc() { + std::string get_desc() override { return mmdit.get_desc(); } - void alloc_params_buffer() { + void alloc_params_buffer() override { mmdit.alloc_params_buffer(); } - void free_params_buffer() { + void free_params_buffer() override { mmdit.free_params_buffer(); } - void free_compute_buffer() { + void free_compute_buffer() override { mmdit.free_compute_buffer(); } - void get_param_tensors(std::map& tensors) { + void get_param_tensors(std::map& tensors) override { mmdit.get_param_tensors(tensors, "model.diffusion_model"); } - size_t get_params_buffer_size() { + size_t get_params_buffer_size() override { return mmdit.get_params_buffer_size(); } - int64_t get_adm_in_channels() { + int64_t get_adm_in_channels() override { return 768 + 1280; } void compute(int n_threads, DiffusionParams diffusion_params, - struct ggml_tensor** output = NULL, - struct ggml_context* output_ctx = NULL) { + struct ggml_tensor** output = nullptr, + struct ggml_context* output_ctx = nullptr) override { return mmdit.compute(n_threads, diffusion_params.x, diffusion_params.timesteps, @@ -158,38 +158,38 @@ struct FluxModel : public DiffusionModel { : flux(backend, offload_params_to_cpu, tensor_types, "model.diffusion_model", version, flash_attn, use_mask) { } - std::string get_desc() { + std::string get_desc() override { return flux.get_desc(); } - void alloc_params_buffer() { + void alloc_params_buffer() override { flux.alloc_params_buffer(); } - void free_params_buffer() { + void free_params_buffer() override { flux.free_params_buffer(); } - void free_compute_buffer() { + void free_compute_buffer() override { flux.free_compute_buffer(); } - void get_param_tensors(std::map& tensors) { + void get_param_tensors(std::map& tensors) override { flux.get_param_tensors(tensors, "model.diffusion_model"); } - size_t get_params_buffer_size() { + size_t get_params_buffer_size() override { return flux.get_params_buffer_size(); } - int64_t get_adm_in_channels() { + int64_t get_adm_in_channels() override { return 768; } void compute(int n_threads, DiffusionParams diffusion_params, - struct ggml_tensor** output = NULL, - struct ggml_context* output_ctx = NULL) { + struct ggml_tensor** output = nullptr, + struct ggml_context* output_ctx = nullptr) override { return flux.compute(n_threads, diffusion_params.x, diffusion_params.timesteps, @@ -218,45 +218,45 @@ struct WanModel : public DiffusionModel { : prefix(prefix), wan(backend, offload_params_to_cpu, tensor_types, prefix, version, flash_attn) { } - std::string get_desc() { + std::string get_desc() override { return wan.get_desc(); } - void alloc_params_buffer() { + void alloc_params_buffer() override { wan.alloc_params_buffer(); } - void free_params_buffer() { + void free_params_buffer() override { wan.free_params_buffer(); } - void free_compute_buffer() { + void free_compute_buffer() override { wan.free_compute_buffer(); } - void 
get_param_tensors(std::map& tensors) { + void get_param_tensors(std::map& tensors) override { wan.get_param_tensors(tensors, prefix); } - size_t get_params_buffer_size() { + size_t get_params_buffer_size() override { return wan.get_params_buffer_size(); } - int64_t get_adm_in_channels() { + int64_t get_adm_in_channels() override { return 768; } void compute(int n_threads, DiffusionParams diffusion_params, - struct ggml_tensor** output = NULL, - struct ggml_context* output_ctx = NULL) { + struct ggml_tensor** output = nullptr, + struct ggml_context* output_ctx = nullptr) override { return wan.compute(n_threads, diffusion_params.x, diffusion_params.timesteps, diffusion_params.context, diffusion_params.y, diffusion_params.c_concat, - NULL, + nullptr, diffusion_params.vace_context, diffusion_params.vace_strength, output, @@ -277,38 +277,38 @@ struct QwenImageModel : public DiffusionModel { : prefix(prefix), qwen_image(backend, offload_params_to_cpu, tensor_types, prefix, version, flash_attn) { } - std::string get_desc() { + std::string get_desc() override { return qwen_image.get_desc(); } - void alloc_params_buffer() { + void alloc_params_buffer() override { qwen_image.alloc_params_buffer(); } - void free_params_buffer() { + void free_params_buffer() override { qwen_image.free_params_buffer(); } - void free_compute_buffer() { + void free_compute_buffer() override { qwen_image.free_compute_buffer(); } - void get_param_tensors(std::map& tensors) { + void get_param_tensors(std::map& tensors) override { qwen_image.get_param_tensors(tensors, prefix); } - size_t get_params_buffer_size() { + size_t get_params_buffer_size() override { return qwen_image.get_params_buffer_size(); } - int64_t get_adm_in_channels() { + int64_t get_adm_in_channels() override { return 768; } void compute(int n_threads, DiffusionParams diffusion_params, - struct ggml_tensor** output = NULL, - struct ggml_context* output_ctx = NULL) { + struct ggml_tensor** output = nullptr, + struct ggml_context* output_ctx = nullptr) override { return qwen_image.compute(n_threads, diffusion_params.x, diffusion_params.timesteps, diff --git a/esrgan.hpp b/esrgan.hpp index fe5f16d2..fa18532d 100644 --- a/esrgan.hpp +++ b/esrgan.hpp @@ -156,9 +156,10 @@ struct ESRGAN : public GGMLRunner { ESRGAN(ggml_backend_t backend, bool offload_params_to_cpu, + int tile_size = 128, const String2GGMLType& tensor_types = {}) : GGMLRunner(backend, offload_params_to_cpu) { - // rrdb_net will be created in load_from_file + this->tile_size = tile_size; } void enable_conv2d_direct() { @@ -174,7 +175,7 @@ struct ESRGAN : public GGMLRunner { } } - std::string get_desc() { + std::string get_desc() override { return "esrgan"; } @@ -367,7 +368,7 @@ struct ESRGAN : public GGMLRunner { void compute(const int n_threads, struct ggml_tensor* x, ggml_tensor** output, - ggml_context* output_ctx = NULL) { + ggml_context* output_ctx = nullptr) { auto get_graph = [&]() -> struct ggml_cgraph* { return build_graph(x); }; diff --git a/examples/cli/README.md b/examples/cli/README.md index 6e8ddd48..ee17d17d 100644 --- a/examples/cli/README.md +++ b/examples/cli/README.md @@ -1,113 +1,110 @@ # Run ``` -usage: ./bin/sd [arguments] +usage: ./bin/sd [options] -arguments: - -h, --help show this help message and exit - -M, --mode [MODE] run mode, one of: [img_gen, vid_gen, upscale, convert], default: img_gen - -t, --threads N number of threads to use during computation (default: -1) - If threads <= 0, then threads will be set to the number of CPU physical cores - --offload-to-cpu place 
the weights in RAM to save VRAM, and automatically load them into VRAM when needed - -m, --model [MODEL] path to full model - --diffusion-model path to the standalone diffusion model - --high-noise-diffusion-model path to the standalone high noise diffusion model - --clip_l path to the clip-l text encoder - --clip_g path to the clip-g text encoder - --clip_vision path to the clip-vision encoder - --t5xxl path to the t5xxl text encoder - --qwen2vl path to the qwen2vl text encoder - --qwen2vl_vision path to the qwen2vl vit - --vae [VAE] path to vae - --taesd [TAESD_PATH] path to taesd. Using Tiny AutoEncoder for fast decoding (low quality) - --control-net [CONTROL_PATH] path to control net model - --embd-dir [EMBEDDING_PATH] path to embeddings - --upscale-model [ESRGAN_PATH] path to esrgan model. For img_gen mode, upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now - --upscale-repeats Run the ESRGAN upscaler this many times (default 1) - --type [TYPE] weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K) - If not specified, the default is the type of the weight file - --tensor-type-rules [EXPRESSION] weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0") - --lora-model-dir [DIR] lora model directory - -i, --init-img [IMAGE] path to the init image, required by img2img - --mask [MASK] path to the mask image, required by img2img with mask - -i, --end-img [IMAGE] path to the end image, required by flf2v - --control-image [IMAGE] path to image condition, control net - -r, --ref-image [PATH] reference image for Flux Kontext models (can be used multiple times) - --control-video [PATH] path to control video frames, It must be a directory path. - The video frames inside should be stored as images in lexicographical (character) order - For example, if the control video path is `frames`, the directory contain images such as 00.png, 01.png, … etc. - --increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1). 
- -o, --output OUTPUT path to write result image to (default: ./output.png) - -p, --prompt [PROMPT] the prompt to render - -n, --negative-prompt PROMPT the negative prompt (default: "") - --cfg-scale SCALE unconditional guidance scale: (default: 7.0) - --img-cfg-scale SCALE image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale) - --guidance SCALE distilled guidance scale for models with guidance input (default: 3.5) - --slg-scale SCALE skip layer guidance (SLG) scale, only for DiT models: (default: 0) - 0 means disabled, a value of 2.5 is nice for sd3.5 medium - --eta SCALE eta in DDIM, only for DDIM and TCD: (default: 0) - --skip-layers LAYERS Layers to skip for SLG steps: (default: [7,8,9]) - --skip-layer-start START SLG enabling point: (default: 0.01) - --skip-layer-end END SLG disabling point: (default: 0.2) - --scheduler {discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple} Denoiser sigma scheduler (default: discrete) - --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd} - sampling method (default: "euler" for Flux/SD3/Wan, "euler_a" otherwise) - --timestep-shift N shift timestep for NitroFusion models, default: 0, recommended N for NitroSD-Realism around 250 and 500 for NitroSD-Vibrant - --steps STEPS number of sample steps (default: 20) - --high-noise-cfg-scale SCALE (high noise) unconditional guidance scale: (default: 7.0) - --high-noise-img-cfg-scale SCALE (high noise) image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale) - --high-noise-guidance SCALE (high noise) distilled guidance scale for models with guidance input (default: 3.5) - --high-noise-slg-scale SCALE (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0) - 0 means disabled, a value of 2.5 is nice for sd3.5 medium - --high-noise-eta SCALE (high noise) eta in DDIM, only for DDIM and TCD: (default: 0) - --high-noise-skip-layers LAYERS (high noise) Layers to skip for SLG steps: (default: [7,8,9]) - --high-noise-skip-layer-start (high noise) SLG enabling point: (default: 0.01) - --high-noise-skip-layer-end END (high noise) SLG disabling point: (default: 0.2) - --high-noise-scheduler {discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple} Denoiser sigma scheduler (default: discrete) - --high-noise-sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd} - (high noise) sampling method (default: "euler_a") - --high-noise-steps STEPS (high noise) number of sample steps (default: -1 = auto) - SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END]) - --strength STRENGTH strength for noising/unnoising (default: 0.75) - --control-strength STRENGTH strength to apply Control Net (default: 0.9) - 1.0 corresponds to full destruction of information in init image - -H, --height H image height, in pixel space (default: 512) - -W, --width W image width, in pixel space (default: 512) - --rng {std_default, cuda} RNG (default: cuda) - -s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0) - -b, --batch-count COUNT number of images to generate - --prediction {eps, v, edm_v, sd3_flow, flux_flow} Prediction type override - --clip-skip N ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1) - <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x - --vae-tiling process vae in tiles to reduce memory 
usage - --vae-tile-size [X]x[Y] tile size for vae tiling (default: 32x32) - --vae-relative-tile-size [X]x[Y] relative tile size for vae tiling, in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size) - --vae-tile-overlap OVERLAP tile overlap for vae tiling, in fraction of tile size (default: 0.5) - --force-sdxl-vae-conv-scale force use of conv scale on sdxl vae - --vae-on-cpu keep vae in cpu (for low vram) - --clip-on-cpu keep clip in cpu (for low vram) - --diffusion-fa use flash attention in the diffusion model (for low vram) - Might lower quality, since it implies converting k and v to f16. - This might crash if it is not supported by the backend. - --diffusion-conv-direct use Conv2d direct in the diffusion model - This might crash if it is not supported by the backend. - --vae-conv-direct use Conv2d direct in the vae model (should improve the performance) - This might crash if it is not supported by the backend. - --control-net-cpu keep controlnet in cpu (for low vram) - --canny apply canny preprocessor (edge detection) - --color colors the logging tags according to level - --chroma-disable-dit-mask disable dit mask for chroma - --chroma-enable-t5-mask enable t5 mask for chroma - --chroma-t5-mask-pad PAD_SIZE t5 mask pad size of chroma - --video-frames video frames (default: 1) - --fps fps (default: 24) - --moe-boundary BOUNDARY timestep boundary for Wan2.2 MoE model. (default: 0.875) - only enabled if `--high-noise-steps` is set to -1 - --flow-shift SHIFT shift value for Flow models like SD3.x or WAN (default: auto) - --vace-strength wan vace strength - --photo-maker path to PHOTOMAKER model - --pm-id-images-dir [DIR] path to PHOTOMAKER input id images dir - --pm-id-embed-path [PATH] path to PHOTOMAKER v2 id embed - --pm-style-strength strength for keeping PHOTOMAKER input identity (default: 20) - -v, --verbose print extra info +Options: + -m, --model path to full model + --clip_l path to the clip-l text encoder + --clip_g path to the clip-g text encoder + --clip_vision path to the clip-vision encoder + --t5xxl path to the t5xxl text encoder + --qwen2vl path to the qwen2vl text encoder + --qwen2vl_vision path to the qwen2vl vit + --diffusion-model path to the standalone diffusion model + --high-noise-diffusion-model path to the standalone high noise diffusion model + --vae path to standalone vae model + --taesd path to taesd. Using Tiny AutoEncoder for fast decoding (low quality) + --control-net path to control net model + --embd-dir embeddings directory + --lora-model-dir lora model directory + -i, --init-img path to the init image + --end-img path to the end image, required by flf2v + --tensor-type-rules weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0") + --photo-maker path to PHOTOMAKER model + --pm-id-images-dir path to PHOTOMAKER input id images dir + --pm-id-embed-path path to PHOTOMAKER v2 id embed + --mask path to the mask image + --control-image path to control image, control net + --control-video path to control video frames, It must be a directory path. The video frames inside should be stored as images in + lexicographical (character) order. For example, if the control video path is + `frames`, the directory contain images such as 00.png, 01.png, ... etc. + -o, --output path to write result image to (default: ./output.png) + -p, --prompt the prompt to render + -n, --negative-prompt the negative prompt (default: "") + --upscale-model path to esrgan model. 
+ -t, --threads number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of + CPU physical cores + --upscale-repeats Run the ESRGAN upscaler this many times (default: 1) + -H, --height image height, in pixel space (default: 512) + -W, --width image width, in pixel space (default: 512) + --steps number of sample steps (default: 20) + --high-noise-steps (high noise) number of sample steps (default: -1 = auto) + --clip-skip ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1). <= 0 represents unspecified, + will be 1 for SD1.x, 2 for SD2.x + -b, --batch-count batch count + --chroma-t5-mask-pad t5 mask pad size of chroma + --video-frames video frames (default: 1) + --fps fps (default: 24) + --timestep-shift shift timestep for NitroFusion models (default: 0). recommended N for NitroSD-Realism around 250 and 500 for + NitroSD-Vibrant + --cfg-scale unconditional guidance scale: (default: 7.0) + --img-cfg-scale image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale) + --guidance distilled guidance scale for models with guidance input (default: 3.5) + --slg-scale skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means disabled, a value of 2.5 is nice for sd3.5 + medium + --skip-layer-start SLG enabling point (default: 0.01) + --skip-layer-end SLG disabling point (default: 0.2) + --eta eta in DDIM, only for DDIM and TCD (default: 0) + --high-noise-cfg-scale (high noise) unconditional guidance scale: (default: 7.0) + --high-noise-img-cfg-scale (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale) + --high-noise-guidance (high noise) distilled guidance scale for models with guidance input (default: 3.5) + --high-noise-slg-scale (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0) + --high-noise-skip-layer-start (high noise) SLG enabling point (default: 0.01) + --high-noise-skip-layer-end (high noise) SLG disabling point (default: 0.2) + --high-noise-eta (high noise) eta in DDIM, only for DDIM and TCD (default: 0) + --strength strength for noising/unnoising (default: 0.75) + --pm-style-strength + --control-strength strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image + --moe-boundary timestep boundary for Wan2.2 MoE model. (default: 0.875). 
Only enabled if `--high-noise-steps` is set to -1 + --flow-shift shift value for Flow models like SD3.x or WAN (default: auto) + --vace-strength wan vace strength + --vae-tile-overlap tile overlap for vae tiling, in fraction of tile size (default: 0.5) + --vae-tiling process vae in tiles to reduce memory usage + --force-sdxl-vae-conv-scale force use of conv scale on sdxl vae + --offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed + --control-net-cpu keep controlnet in cpu (for low vram) + --clip-on-cpu keep clip in cpu (for low vram) + --vae-on-cpu keep vae in cpu (for low vram) + --diffusion-fa use flash attention in the diffusion model + --diffusion-conv-direct use ggml_conv2d_direct in the diffusion model + --vae-conv-direct use ggml_conv2d_direct in the vae model + --canny apply canny preprocessor (edge detection) + -v, --verbose print extra info + --color colors the logging tags according to level + --chroma-disable-dit-mask disable dit mask for chroma + --chroma-enable-t5-mask enable t5 mask for chroma + --increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1). + --disable-auto-resize-ref-image disable auto resize of ref images + -M, --mode run mode, one of [img_gen, vid_gen, upscale, convert], default: img_gen + --type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the + type of the weight file + --rng RNG, one of [std_default, cuda], default: cuda + -s, --seed RNG seed (default: 42, use random seed for < 0) + --sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, + tcd] (default: euler for Flux/SD3/Wan, euler_a otherwise) + --prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow] + --scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple], default: + discrete + --skip-layers layers to skip for SLG steps (default: [7,8,9]) + --high-noise-sampling-method (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, + ddim_trailing, tcd] default: euler for Flux/SD3/Wan, euler_a otherwise + --high-noise-scheduler (high noise) denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, + simple], default: discrete + --high-noise-skip-layers (high noise) layers to skip for SLG steps (default: [7,8,9]) + -r, --ref-image reference image for Flux Kontext models (can be used multiple times) + -h, --help show this help message and exit + --vae-tile-size tile size for vae tiling, format [X]x[Y] (default: 32x32) + --vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1 + (overrides --vae-tile-size) ``` \ No newline at end of file diff --git a/examples/cli/avi_writer.h b/examples/cli/avi_writer.h index 8cfb9a57..84b204af 100644 --- a/examples/cli/avi_writer.h +++ b/examples/cli/avi_writer.h @@ -1,10 +1,10 @@ #ifndef __AVI_WRITER_H__ #define __AVI_WRITER_H__ -#include -#include -#include -#include +#include +#include +#include +#include #include "stable-diffusion.h" @@ -130,7 +130,7 @@ int create_mjpg_avi_from_sd_images(const char* filename, sd_image_t* images, int write_u32_le(f, 0); // Colors important // 'movi' LIST (video frames) - long movi_list_pos = 
ftell(f); + // long movi_list_pos = ftell(f); fwrite("LIST", 4, 1, f); long movi_size_pos = ftell(f); write_u32_le(f, 0); // Placeholder for movi size @@ -149,7 +149,7 @@ } jpeg_data; for (int i = 0; i < num_images; i++) { - jpeg_data.buf = NULL; + jpeg_data.buf = nullptr; jpeg_data.size = 0; // Callback function to collect JPEG data into memory diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index ff36cea2..24f81032 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -80,7 +81,8 @@ struct SDParams { std::string control_image_path; std::vector<std::string> ref_image_paths; std::string control_video_path; - bool increase_ref_index = false; + bool auto_resize_ref_image = true; + bool increase_ref_index = false; std::string prompt; std::string negative_prompt; @@ -116,6 +118,7 @@ struct SDParams { bool canny_preprocess = false; bool color = false; int upscale_repeats = 1; + int upscale_tile_size = 128; // Photo Maker std::string photo_maker_path; @@ -175,6 +178,7 @@ void print_params(SDParams params) { printf(" %s\n", path.c_str()); }; printf(" control_video_path: %s\n", params.control_video_path.c_str()); + printf(" auto_resize_ref_image: %s\n", params.auto_resize_ref_image ? "true" : "false"); printf(" increase_ref_index: %s\n", params.increase_ref_index ? "true" : "false"); printf(" offload_params_to_cpu: %s\n", params.offload_params_to_cpu ? "true" : "false"); printf(" clip_on_cpu: %s\n", params.clip_on_cpu ? "true" : "false"); @@ -201,6 +205,7 @@ void print_params(SDParams params) { printf(" vae_tiling: %s\n", params.vae_tiling_params.enabled ? "true" : "false"); printf(" force_sdxl_vae_conv_scale: %s\n", params.force_sdxl_vae_conv_scale ? "true" : "false"); printf(" upscale_repeats: %d\n", params.upscale_repeats); + printf(" upscale_tile_size: %d\n", params.upscale_tile_size); printf(" chroma_use_dit_mask: %s\n", params.chroma_use_dit_mask ? "true" : "false"); printf(" chroma_use_t5_mask: %s\n", params.chroma_use_t5_mask ? 
"true" : "false"); printf(" chroma_t5_mask_pad: %d\n", params.chroma_t5_mask_pad); @@ -211,118 +216,6 @@ void print_params(SDParams params) { free(high_noise_sample_params_str); } -void print_usage(int argc, const char* argv[]) { - printf("usage: %s [arguments]\n", argv[0]); - printf("\n"); - printf("arguments:\n"); - printf(" -h, --help show this help message and exit\n"); - printf(" -M, --mode [MODE] run mode, one of: [img_gen, vid_gen, upscale, convert], default: img_gen\n"); - printf(" -t, --threads N number of threads to use during computation (default: -1)\n"); - printf(" If threads <= 0, then threads will be set to the number of CPU physical cores\n"); - printf(" --offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed\n"); - printf(" -m, --model [MODEL] path to full model\n"); - printf(" --diffusion-model path to the standalone diffusion model\n"); - printf(" --high-noise-diffusion-model path to the standalone high noise diffusion model\n"); - printf(" --clip_l path to the clip-l text encoder\n"); - printf(" --clip_g path to the clip-g text encoder\n"); - printf(" --clip_vision path to the clip-vision encoder\n"); - printf(" --t5xxl path to the t5xxl text encoder\n"); - printf(" --qwen2vl path to the qwen2vl text encoder\n"); - printf(" --qwen2vl_vision path to the qwen2vl vit\n"); - printf(" --vae [VAE] path to vae\n"); - printf(" --taesd [TAESD_PATH] path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)\n"); - printf(" --control-net [CONTROL_PATH] path to control net model\n"); - printf(" --embd-dir [EMBEDDING_PATH] path to embeddings\n"); - printf(" --upscale-model [ESRGAN_PATH] path to esrgan model. For img_gen mode, upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now\n"); - printf(" --upscale-repeats Run the ESRGAN upscaler this many times (default 1)\n"); - printf(" --type [TYPE] weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K)\n"); - printf(" If not specified, the default is the type of the weight file\n"); - printf(" --tensor-type-rules [EXPRESSION] weight type per tensor pattern (example: \"^vae\\.=f16,model\\.=q8_0\")\n"); - printf(" --lora-model-dir [DIR] lora model directory\n"); - printf(" -i, --init-img [IMAGE] path to the init image, required by img2img\n"); - printf(" --mask [MASK] path to the mask image, required by img2img with mask\n"); - printf(" -i, --end-img [IMAGE] path to the end image, required by flf2v\n"); - printf(" --control-image [IMAGE] path to image condition, control net\n"); - printf(" -r, --ref-image [PATH] reference image for Flux Kontext models (can be used multiple times) \n"); - printf(" --control-video [PATH] path to control video frames, It must be a directory path.\n"); - printf(" The video frames inside should be stored as images in lexicographical (character) order\n"); - printf(" For example, if the control video path is `frames`, the directory contain images such as 00.png, 01.png, … etc.\n"); - printf(" --increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1).\n"); - printf(" -o, --output OUTPUT path to write result image to (default: ./output.png)\n"); - printf(" -p, --prompt [PROMPT] the prompt to render\n"); - printf(" -n, --negative-prompt PROMPT the negative prompt (default: \"\")\n"); - printf(" --cfg-scale SCALE unconditional guidance scale: (default: 7.0)\n"); - printf(" --img-cfg-scale SCALE image guidance scale for inpaint or 
instruct-pix2pix models: (default: same as --cfg-scale)\n"); - printf(" --guidance SCALE distilled guidance scale for models with guidance input (default: 3.5)\n"); - printf(" --slg-scale SCALE skip layer guidance (SLG) scale, only for DiT models: (default: 0)\n"); - printf(" 0 means disabled, a value of 2.5 is nice for sd3.5 medium\n"); - printf(" --eta SCALE eta in DDIM, only for DDIM and TCD: (default: 0)\n"); - printf(" --skip-layers LAYERS Layers to skip for SLG steps: (default: [7,8,9])\n"); - printf(" --skip-layer-start START SLG enabling point: (default: 0.01)\n"); - printf(" --skip-layer-end END SLG disabling point: (default: 0.2)\n"); - printf(" --scheduler {discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple} Denoiser sigma scheduler (default: discrete)\n"); - printf(" --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}\n"); - printf(" sampling method (default: \"euler\" for Flux/SD3/Wan, \"euler_a\" otherwise)\n"); - printf(" --timestep-shift N shift timestep for NitroFusion models, default: 0, recommended N for NitroSD-Realism around 250 and 500 for NitroSD-Vibrant\n"); - printf(" --steps STEPS number of sample steps (default: 20)\n"); - printf(" --high-noise-cfg-scale SCALE (high noise) unconditional guidance scale: (default: 7.0)\n"); - printf(" --high-noise-img-cfg-scale SCALE (high noise) image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)\n"); - printf(" --high-noise-guidance SCALE (high noise) distilled guidance scale for models with guidance input (default: 3.5)\n"); - printf(" --high-noise-slg-scale SCALE (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)\n"); - printf(" 0 means disabled, a value of 2.5 is nice for sd3.5 medium\n"); - printf(" --high-noise-eta SCALE (high noise) eta in DDIM, only for DDIM and TCD: (default: 0)\n"); - printf(" --high-noise-skip-layers LAYERS (high noise) Layers to skip for SLG steps: (default: [7,8,9])\n"); - printf(" --high-noise-skip-layer-start (high noise) SLG enabling point: (default: 0.01)\n"); - printf(" --high-noise-skip-layer-end END (high noise) SLG disabling point: (default: 0.2)\n"); - printf(" --high-noise-scheduler {discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple} Denoiser sigma scheduler (default: discrete)\n"); - printf(" --high-noise-sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}\n"); - printf(" (high noise) sampling method (default: \"euler_a\")\n"); - printf(" --high-noise-steps STEPS (high noise) number of sample steps (default: -1 = auto)\n"); - printf(" SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])\n"); - printf(" --strength STRENGTH strength for noising/unnoising (default: 0.75)\n"); - printf(" --control-strength STRENGTH strength to apply Control Net (default: 0.9)\n"); - printf(" 1.0 corresponds to full destruction of information in init image\n"); - printf(" -H, --height H image height, in pixel space (default: 512)\n"); - printf(" -W, --width W image width, in pixel space (default: 512)\n"); - printf(" --rng {std_default, cuda} RNG (default: cuda)\n"); - printf(" -s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0)\n"); - printf(" -b, --batch-count COUNT number of images to generate\n"); - printf(" --prediction {eps, v, edm_v, sd3_flow, flux_flow} Prediction type override.\n"); - printf(" --clip-skip N 
ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)\n"); - printf(" <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x\n"); - printf(" --vae-tiling process vae in tiles to reduce memory usage\n"); - printf(" --vae-tile-size [X]x[Y] tile size for vae tiling (default: 32x32)\n"); - printf(" --vae-relative-tile-size [X]x[Y] relative tile size for vae tiling, in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)\n"); - printf(" --vae-tile-overlap OVERLAP tile overlap for vae tiling, in fraction of tile size (default: 0.5)\n"); - printf(" --force-sdxl-vae-conv-scale force use of conv scale on sdxl vae\n"); - printf(" --vae-on-cpu keep vae in cpu (for low vram)\n"); - printf(" --clip-on-cpu keep clip in cpu (for low vram)\n"); - printf(" --diffusion-fa use flash attention in the diffusion model (for low vram)\n"); - printf(" Might lower quality, since it implies converting k and v to f16.\n"); - printf(" This might crash if it is not supported by the backend.\n"); - printf(" --diffusion-conv-direct use Conv2d direct in the diffusion model\n"); - printf(" This might crash if it is not supported by the backend.\n"); - printf(" --vae-conv-direct use Conv2d direct in the vae model (should improve the performance)\n"); - printf(" This might crash if it is not supported by the backend.\n"); - printf(" --control-net-cpu keep controlnet in cpu (for low vram)\n"); - printf(" --canny apply canny preprocessor (edge detection)\n"); - printf(" --color colors the logging tags according to level\n"); - printf(" --chroma-disable-dit-mask disable dit mask for chroma\n"); - printf(" --chroma-enable-t5-mask enable t5 mask for chroma\n"); - printf(" --chroma-t5-mask-pad PAD_SIZE t5 mask pad size of chroma\n"); - printf(" --video-frames video frames (default: 1)\n"); - printf(" --fps fps (default: 24)\n"); - printf(" --moe-boundary BOUNDARY timestep boundary for Wan2.2 MoE model. 
(default: 0.875)\n"); - printf(" only enabled if `--high-noise-steps` is set to -1\n"); - printf(" --flow-shift SHIFT shift value for Flow models like SD3.x or WAN (default: auto)\n"); - printf(" --vace-strength wan vace strength\n"); - printf(" --photo-maker path to PHOTOMAKER model\n"); - printf(" --pm-id-images-dir [DIR] path to PHOTOMAKER input id images dir\n"); - printf(" --pm-id-embed-path [PATH] path to PHOTOMAKER v2 id embed\n"); - printf(" --pm-style-strength strength for keeping PHOTOMAKER input identity (default: 20)\n"); - printf(" -v, --verbose print extra info\n"); -} - #if defined(_WIN32) static std::string utf16_to_utf8(const std::wstring& wstr) { if (wstr.empty()) @@ -492,93 +385,428 @@ bool parse_options(int argc, const char** argv, ArgOptions& options) { return true; } +static std::string wrap_text(const std::string& text, size_t width, size_t indent) { + std::ostringstream oss; + size_t line_len = 0; + size_t pos = 0; + + while (pos < text.size()) { + // Preserve manual newlines + if (text[pos] == '\n') { + oss << '\n' + << std::string(indent, ' '); + line_len = indent; + ++pos; + continue; + } + + // Add the character + oss << text[pos]; + ++line_len; + ++pos; + + // If the current line exceeds width, try to break at the last space + if (line_len >= width) { + std::string current = oss.str(); + size_t back = current.size(); + + // Find the last space (for a clean break) + while (back > 0 && current[back - 1] != ' ' && current[back - 1] != '\n') + --back; + + // If found a space to break on + if (back > 0 && current[back - 1] != '\n') { + std::string before = current.substr(0, back - 1); + std::string after = current.substr(back); + oss.str(""); + oss.clear(); + oss << before << "\n" + << std::string(indent, ' ') << after; + } else { + // If no space found, just break at width + oss << "\n" + << std::string(indent, ' '); + } + line_len = indent; + } + } + + return oss.str(); +} + +void print_usage(int argc, const char* argv[], const ArgOptions& options) { + constexpr size_t max_line_width = 120; + + std::cout << "Usage: " << argv[0] << " [options]\n\n"; + std::cout << "Options:\n"; + + struct Entry { + std::string names; + std::string desc; + }; + std::vector entries; + + auto add_entry = [&](const std::string& s, const std::string& l, + const std::string& desc, const std::string& hint = "") { + std::ostringstream ss; + if (!s.empty()) + ss << s; + if (!s.empty() && !l.empty()) + ss << ", "; + if (!l.empty()) + ss << l; + if (!hint.empty()) + ss << " " << hint; + entries.push_back({ss.str(), desc}); + }; + + for (auto& o : options.string_options) + add_entry(o.short_name, o.long_name, o.desc, ""); + for (auto& o : options.int_options) + add_entry(o.short_name, o.long_name, o.desc, ""); + for (auto& o : options.float_options) + add_entry(o.short_name, o.long_name, o.desc, ""); + for (auto& o : options.bool_options) + add_entry(o.short_name, o.long_name, o.desc, ""); + for (auto& o : options.manual_options) + add_entry(o.short_name, o.long_name, o.desc); + + size_t max_name_width = 0; + for (auto& e : entries) + max_name_width = std::max(max_name_width, e.names.size()); + + for (auto& e : entries) { + size_t indent = 2 + max_name_width + 4; + size_t desc_width = (max_line_width > indent ? 
max_line_width - indent : 40); + std::string wrapped_desc = wrap_text(e.desc, max_line_width, indent); + std::cout << " " << std::left << std::setw(static_cast(max_name_width) + 4) + << e.names << wrapped_desc << "\n"; + } +} + void parse_args(int argc, const char** argv, SDParams& params) { ArgOptions options; options.string_options = { - {"-m", "--model", "", ¶ms.model_path}, - {"", "--clip_l", "", ¶ms.clip_l_path}, - {"", "--clip_g", "", ¶ms.clip_g_path}, - {"", "--clip_vision", "", ¶ms.clip_vision_path}, - {"", "--t5xxl", "", ¶ms.t5xxl_path}, - {"", "--qwen2vl", "", ¶ms.qwen2vl_path}, - {"", "--qwen2vl_vision", "", ¶ms.qwen2vl_vision_path}, - {"", "--diffusion-model", "", ¶ms.diffusion_model_path}, - {"", "--high-noise-diffusion-model", "", ¶ms.high_noise_diffusion_model_path}, - {"", "--vae", "", ¶ms.vae_path}, - {"", "--taesd", "", ¶ms.taesd_path}, - {"", "--control-net", "", ¶ms.control_net_path}, - {"", "--embd-dir", "", ¶ms.embedding_dir}, - {"", "--lora-model-dir", "", ¶ms.lora_model_dir}, - {"-i", "--init-img", "", ¶ms.init_image_path}, - {"", "--end-img", "", ¶ms.end_image_path}, - {"", "--tensor-type-rules", "", ¶ms.tensor_type_rules}, - {"", "--photo-maker", "", ¶ms.photo_maker_path}, - {"", "--pm-id-images-dir", "", ¶ms.pm_id_images_dir}, - {"", "--pm-id-embed-path", "", ¶ms.pm_id_embed_path}, - {"", "--mask", "", ¶ms.mask_image_path}, - {"", "--control-image", "", ¶ms.control_image_path}, - {"", "--control-video", "", ¶ms.control_video_path}, - {"-o", "--output", "", ¶ms.output_path}, - {"-p", "--prompt", "", ¶ms.prompt}, - {"-n", "--negative-prompt", "", ¶ms.negative_prompt}, - {"", "--upscale-model", "", ¶ms.esrgan_path}, + {"-m", + "--model", + "path to full model", + ¶ms.model_path}, + {"", + "--clip_l", + "path to the clip-l text encoder", ¶ms.clip_l_path}, + {"", "--clip_g", + "path to the clip-g text encoder", + ¶ms.clip_g_path}, + {"", + "--clip_vision", + "path to the clip-vision encoder", + ¶ms.clip_vision_path}, + {"", + "--t5xxl", + "path to the t5xxl text encoder", + ¶ms.t5xxl_path}, + {"", + "--qwen2vl", + "path to the qwen2vl text encoder", + ¶ms.qwen2vl_path}, + {"", + "--qwen2vl_vision", + "path to the qwen2vl vit", + ¶ms.qwen2vl_vision_path}, + {"", + "--diffusion-model", + "path to the standalone diffusion model", + ¶ms.diffusion_model_path}, + {"", + "--high-noise-diffusion-model", + "path to the standalone high noise diffusion model", + ¶ms.high_noise_diffusion_model_path}, + {"", + "--vae", + "path to standalone vae model", + ¶ms.vae_path}, + {"", + "--taesd", + "path to taesd. 
Using Tiny AutoEncoder for fast decoding (low quality)", + ¶ms.taesd_path}, + {"", + "--control-net", + "path to control net model", + ¶ms.control_net_path}, + {"", + "--embd-dir", + "embeddings directory", + ¶ms.embedding_dir}, + {"", + "--lora-model-dir", + "lora model directory", + ¶ms.lora_model_dir}, + {"-i", + "--init-img", + "path to the init image", + ¶ms.init_image_path}, + {"", + "--end-img", + "path to the end image, required by flf2v", + ¶ms.end_image_path}, + {"", + "--tensor-type-rules", + "weight type per tensor pattern (example: \"^vae\\.=f16,model\\.=q8_0\")", + ¶ms.tensor_type_rules}, + {"", + "--photo-maker", + "path to PHOTOMAKER model", + ¶ms.photo_maker_path}, + {"", + "--pm-id-images-dir", + "path to PHOTOMAKER input id images dir", + ¶ms.pm_id_images_dir}, + {"", + "--pm-id-embed-path", + "path to PHOTOMAKER v2 id embed", + ¶ms.pm_id_embed_path}, + {"", + "--mask", + "path to the mask image", + ¶ms.mask_image_path}, + {"", + "--control-image", + "path to control image, control net", + ¶ms.control_image_path}, + {"", + "--control-video", + "path to control video frames, It must be a directory path. The video frames inside should be stored as images in " + "lexicographical (character) order. For example, if the control video path is `frames`, the directory contain images " + "such as 00.png, 01.png, ... etc.", + ¶ms.control_video_path}, + {"-o", + "--output", + "path to write result image to (default: ./output.png)", + ¶ms.output_path}, + {"-p", + "--prompt", + "the prompt to render", + ¶ms.prompt}, + {"-n", + "--negative-prompt", + "the negative prompt (default: \"\")", + ¶ms.negative_prompt}, + {"", + "--upscale-model", + "path to esrgan model.", + ¶ms.esrgan_path}, }; options.int_options = { - {"-t", "--threads", "", ¶ms.n_threads}, - {"", "--upscale-repeats", "", ¶ms.upscale_repeats}, - {"-H", "--height", "", ¶ms.height}, - {"-W", "--width", "", ¶ms.width}, - {"", "--steps", "", ¶ms.sample_params.sample_steps}, - {"", "--high-noise-steps", "", ¶ms.high_noise_sample_params.sample_steps}, - {"", "--clip-skip", "", ¶ms.clip_skip}, - {"-b", "--batch-count", "", ¶ms.batch_count}, - {"", "--chroma-t5-mask-pad", "", ¶ms.chroma_t5_mask_pad}, - {"", "--video-frames", "", ¶ms.video_frames}, - {"", "--fps", "", ¶ms.fps}, - {"", "--timestep-shift", "", ¶ms.sample_params.shifted_timestep}, + {"-t", + "--threads", + "number of threads to use during computation (default: -1). " + "If threads <= 0, then threads will be set to the number of CPU physical cores", + ¶ms.n_threads}, + {"", + "--upscale-repeats", + "Run the ESRGAN upscaler this many times (default: 1)", + ¶ms.upscale_repeats}, + {"", + "--upscale-tile-size", + "tile size for ESRGAN upscaling (default: 128)", + ¶ms.upscale_tile_size}, + {"-H", + "--height", + "image height, in pixel space (default: 512)", + ¶ms.height}, + {"-W", + "--width", + "image width, in pixel space (default: 512)", + ¶ms.width}, + {"", + "--steps", + "number of sample steps (default: 20)", + ¶ms.sample_params.sample_steps}, + {"", + "--high-noise-steps", + "(high noise) number of sample steps (default: -1 = auto)", + ¶ms.high_noise_sample_params.sample_steps}, + {"", + "--clip-skip", + "ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1). 
" + "<= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x", + ¶ms.clip_skip}, + {"-b", + "--batch-count", + "batch count", + ¶ms.batch_count}, + {"", + "--chroma-t5-mask-pad", + "t5 mask pad size of chroma", + ¶ms.chroma_t5_mask_pad}, + {"", + "--video-frames", + "video frames (default: 1)", + ¶ms.video_frames}, + {"", + "--fps", + "fps (default: 24)", + ¶ms.fps}, + {"", + "--timestep-shift", + "shift timestep for NitroFusion models (default: 0). " + "recommended N for NitroSD-Realism around 250 and 500 for NitroSD-Vibrant", + ¶ms.sample_params.shifted_timestep}, }; options.float_options = { - {"", "--cfg-scale", "", ¶ms.sample_params.guidance.txt_cfg}, - {"", "--img-cfg-scale", "", ¶ms.sample_params.guidance.img_cfg}, - {"", "--guidance", "", ¶ms.sample_params.guidance.distilled_guidance}, - {"", "--slg-scale", "", ¶ms.sample_params.guidance.slg.scale}, - {"", "--skip-layer-start", "", ¶ms.sample_params.guidance.slg.layer_start}, - {"", "--skip-layer-end", "", ¶ms.sample_params.guidance.slg.layer_end}, - {"", "--eta", "", ¶ms.sample_params.eta}, - {"", "--high-noise-cfg-scale", "", ¶ms.high_noise_sample_params.guidance.txt_cfg}, - {"", "--high-noise-img-cfg-scale", "", ¶ms.high_noise_sample_params.guidance.img_cfg}, - {"", "--high-noise-guidance", "", ¶ms.high_noise_sample_params.guidance.distilled_guidance}, - {"", "--high-noise-slg-scale", "", ¶ms.high_noise_sample_params.guidance.slg.scale}, - {"", "--high-noise-skip-layer-start", "", ¶ms.high_noise_sample_params.guidance.slg.layer_start}, - {"", "--high-noise-skip-layer-end", "", ¶ms.high_noise_sample_params.guidance.slg.layer_end}, - {"", "--high-noise-eta", "", ¶ms.high_noise_sample_params.eta}, - {"", "--strength", "", ¶ms.strength}, - {"", "--pm-style-strength", "", ¶ms.pm_style_strength}, - {"", "--control-strength", "", ¶ms.control_strength}, - {"", "--moe-boundary", "", ¶ms.moe_boundary}, - {"", "--flow-shift", "", ¶ms.flow_shift}, - {"", "--vace-strength", "", ¶ms.vace_strength}, - {"", "--vae-tile-overlap", "", ¶ms.vae_tiling_params.target_overlap}, + {"", + "--cfg-scale", + "unconditional guidance scale: (default: 7.0)", + ¶ms.sample_params.guidance.txt_cfg}, + {"", + "--img-cfg-scale", + "image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)", + ¶ms.sample_params.guidance.img_cfg}, + {"", + "--guidance", + "distilled guidance scale for models with guidance input (default: 3.5)", + ¶ms.sample_params.guidance.distilled_guidance}, + {"", + "--slg-scale", + "skip layer guidance (SLG) scale, only for DiT models: (default: 0). 
0 means disabled, a value of 2.5 is nice for sd3.5 medium", + ¶ms.sample_params.guidance.slg.scale}, + {"", + "--skip-layer-start", + "SLG enabling point (default: 0.01)", + ¶ms.sample_params.guidance.slg.layer_start}, + {"", + "--skip-layer-end", + "SLG disabling point (default: 0.2)", + ¶ms.sample_params.guidance.slg.layer_end}, + {"", + "--eta", + "eta in DDIM, only for DDIM and TCD (default: 0)", + ¶ms.sample_params.eta}, + {"", + "--high-noise-cfg-scale", + "(high noise) unconditional guidance scale: (default: 7.0)", + ¶ms.high_noise_sample_params.guidance.txt_cfg}, + {"", + "--high-noise-img-cfg-scale", + "(high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale)", + ¶ms.high_noise_sample_params.guidance.img_cfg}, + {"", + "--high-noise-guidance", + "(high noise) distilled guidance scale for models with guidance input (default: 3.5)", + ¶ms.high_noise_sample_params.guidance.distilled_guidance}, + {"", + "--high-noise-slg-scale", + "(high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)", + ¶ms.high_noise_sample_params.guidance.slg.scale}, + {"", + "--high-noise-skip-layer-start", + "(high noise) SLG enabling point (default: 0.01)", + ¶ms.high_noise_sample_params.guidance.slg.layer_start}, + {"", + "--high-noise-skip-layer-end", + "(high noise) SLG disabling point (default: 0.2)", + ¶ms.high_noise_sample_params.guidance.slg.layer_end}, + {"", + "--high-noise-eta", + "(high noise) eta in DDIM, only for DDIM and TCD (default: 0)", + ¶ms.high_noise_sample_params.eta}, + {"", + "--strength", + "strength for noising/unnoising (default: 0.75)", + ¶ms.strength}, + {"", + "--pm-style-strength", + "", + ¶ms.pm_style_strength}, + {"", + "--control-strength", + "strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image", + ¶ms.control_strength}, + {"", + "--moe-boundary", + "timestep boundary for Wan2.2 MoE model. (default: 0.875). 
Only enabled if `--high-noise-steps` is set to -1", + ¶ms.moe_boundary}, + {"", + "--flow-shift", + "shift value for Flow models like SD3.x or WAN (default: auto)", + ¶ms.flow_shift}, + {"", + "--vace-strength", + "wan vace strength", + ¶ms.vace_strength}, + {"", + "--vae-tile-overlap", + "tile overlap for vae tiling, in fraction of tile size (default: 0.5)", + ¶ms.vae_tiling_params.target_overlap}, }; options.bool_options = { - {"", "--vae-tiling", "", true, ¶ms.vae_tiling_params.enabled}, - {"", "--force-sdxl-vae-conv-scale", "", true, ¶ms.force_sdxl_vae_conv_scale}, - {"", "--offload-to-cpu", "", true, ¶ms.offload_params_to_cpu}, - {"", "--control-net-cpu", "", true, ¶ms.control_net_cpu}, - {"", "--clip-on-cpu", "", true, ¶ms.clip_on_cpu}, - {"", "--vae-on-cpu", "", true, ¶ms.vae_on_cpu}, - {"", "--diffusion-fa", "", true, ¶ms.diffusion_flash_attn}, - {"", "--diffusion-conv-direct", "", true, ¶ms.diffusion_conv_direct}, - {"", "--vae-conv-direct", "", true, ¶ms.vae_conv_direct}, - {"", "--canny", "", true, ¶ms.canny_preprocess}, - {"-v", "--verbose", "", true, ¶ms.verbose}, - {"", "--color", "", true, ¶ms.color}, - {"", "--chroma-disable-dit-mask", "", false, ¶ms.chroma_use_dit_mask}, - {"", "--chroma-enable-t5-mask", "", true, ¶ms.chroma_use_t5_mask}, - {"", "--increase-ref-index", "", true, ¶ms.increase_ref_index}, + {"", + "--vae-tiling", + "process vae in tiles to reduce memory usage", + true, ¶ms.vae_tiling_params.enabled}, + {"", + "--force-sdxl-vae-conv-scale", + "force use of conv scale on sdxl vae", + true, ¶ms.force_sdxl_vae_conv_scale}, + {"", + "--offload-to-cpu", + "place the weights in RAM to save VRAM, and automatically load them into VRAM when needed", + true, ¶ms.offload_params_to_cpu}, + {"", + "--control-net-cpu", + "keep controlnet in cpu (for low vram)", + true, ¶ms.control_net_cpu}, + {"", + "--clip-on-cpu", + "keep clip in cpu (for low vram)", + true, ¶ms.clip_on_cpu}, + {"", + "--vae-on-cpu", + "keep vae in cpu (for low vram)", + true, ¶ms.vae_on_cpu}, + {"", + "--diffusion-fa", + "use flash attention in the diffusion model", + true, ¶ms.diffusion_flash_attn}, + {"", + "--diffusion-conv-direct", + "use ggml_conv2d_direct in the diffusion model", + true, ¶ms.diffusion_conv_direct}, + {"", + "--vae-conv-direct", + "use ggml_conv2d_direct in the vae model", + true, ¶ms.vae_conv_direct}, + {"", + "--canny", + "apply canny preprocessor (edge detection)", + true, ¶ms.canny_preprocess}, + {"-v", + "--verbose", + "print extra info", + true, ¶ms.verbose}, + {"", + "--color", + "colors the logging tags according to level", + true, ¶ms.color}, + {"", + "--chroma-disable-dit-mask", + "disable dit mask for chroma", + false, ¶ms.chroma_use_dit_mask}, + {"", + "--chroma-enable-t5-mask", + "enable t5 mask for chroma", + true, ¶ms.chroma_use_t5_mask}, + {"", + "--increase-ref-index", + "automatically increase the indices of references images based on the order they are listed (starting with 1).", + true, ¶ms.increase_ref_index}, + {"", + "--disable-auto-resize-ref-image", + "disable auto resize of ref images", + false, ¶ms.auto_resize_ref_image}, }; auto on_mode_arg = [&](int argc, const char** argv, int index) { @@ -586,7 +814,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { return -1; } const char* mode = argv[index]; - if (mode != NULL) { + if (mode != nullptr) { int mode_found = -1; for (int i = 0; i < MODE_COUNT; i++) { if (!strcmp(mode, modes_str[i])) { @@ -711,7 +939,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { }; auto 
on_help_arg = [&](int argc, const char** argv, int index) { - print_usage(argc, argv); + print_usage(argc, argv, options); exit(0); return 0; }; @@ -825,25 +1053,73 @@ void parse_args(int argc, const char** argv, SDParams& params) { }; options.manual_options = { - {"-M", "--mode", "", on_mode_arg}, - {"", "--type", "", on_type_arg}, - {"", "--rng", "", on_rng_arg}, - {"-s", "--seed", "", on_seed_arg}, - {"", "--sampling-method", "", on_sample_method_arg}, - {"", "--prediction", "", on_prediction_arg}, - {"", "--scheduler", "", on_schedule_arg}, - {"", "--skip-layers", "", on_skip_layers_arg}, - {"", "--high-noise-sampling-method", "", on_high_noise_sample_method_arg}, - {"", "--high-noise-scheduler", "", on_high_noise_schedule_arg}, - {"", "--high-noise-skip-layers", "", on_high_noise_skip_layers_arg}, - {"-r", "--ref-image", "", on_ref_image_arg}, - {"-h", "--help", "", on_help_arg}, - {"", "--vae-tile-size", "", on_tile_size_arg}, - {"", "--vae-relative-tile-size", "", on_relative_tile_size_arg}, + {"-M", + "--mode", + "run mode, one of [img_gen, vid_gen, upscale, convert], default: img_gen", + on_mode_arg}, + {"", + "--type", + "weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). " + "If not specified, the default is the type of the weight file", + on_type_arg}, + {"", + "--rng", + "RNG, one of [std_default, cuda], default: cuda", + on_rng_arg}, + {"-s", + "--seed", + "RNG seed (default: 42, use random seed for < 0)", + on_seed_arg}, + {"", + "--sampling-method", + "sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd] " + "(default: euler for Flux/SD3/Wan, euler_a otherwise)", + on_sample_method_arg}, + {"", + "--prediction", + "prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow]", + on_prediction_arg}, + {"", + "--scheduler", + "denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple], default: discrete", + on_schedule_arg}, + {"", + "--skip-layers", + "layers to skip for SLG steps (default: [7,8,9])", + on_skip_layers_arg}, + {"", + "--high-noise-sampling-method", + "(high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd]" + " default: euler for Flux/SD3/Wan, euler_a otherwise", + on_high_noise_sample_method_arg}, + {"", + "--high-noise-scheduler", + "(high noise) denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple], default: discrete", + on_high_noise_schedule_arg}, + {"", + "--high-noise-skip-layers", + "(high noise) layers to skip for SLG steps (default: [7,8,9])", + on_high_noise_skip_layers_arg}, + {"-r", + "--ref-image", + "reference image for Flux Kontext models (can be used multiple times)", + on_ref_image_arg}, + {"-h", + "--help", + "show this help message and exit", + on_help_arg}, + {"", + "--vae-tile-size", + "tile size for vae tiling, format [X]x[Y] (default: 32x32)", + on_tile_size_arg}, + {"", + "--vae-relative-tile-size", + "relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)", + on_relative_tile_size_arg}, }; if (!parse_options(argc, argv, options)) { - print_usage(argc, argv); + print_usage(argc, argv, options); exit(1); } @@ -853,19 +1129,19 @@ void parse_args(int argc, const char** argv, SDParams& params) { if ((params.mode == IMG_GEN || params.mode == VID_GEN) && 
params.prompt.length() == 0) { fprintf(stderr, "error: the following arguments are required: prompt\n"); - print_usage(argc, argv); + print_usage(argc, argv, options); exit(1); } if (params.mode != UPSCALE && params.model_path.length() == 0 && params.diffusion_model_path.length() == 0) { fprintf(stderr, "error: the following arguments are required: model_path/diffusion_model\n"); - print_usage(argc, argv); + print_usage(argc, argv, options); exit(1); } if (params.output_path.length() == 0) { fprintf(stderr, "error: the following arguments are required: output_path\n"); - print_usage(argc, argv); + print_usage(argc, argv, options); exit(1); } @@ -917,6 +1193,11 @@ void parse_args(int argc, const char** argv, SDParams& params) { exit(1); } + if (params.upscale_tile_size < 1) { + fprintf(stderr, "error: upscale tile size must be at least 1\n"); + exit(1); + } + if (params.mode == UPSCALE) { if (params.esrgan_path.length() == 0) { fprintf(stderr, "error: upscale mode needs an upscaler model (--upscale-model)\n"); @@ -929,7 +1210,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { } if (params.seed < 0) { - srand((int)time(NULL)); + srand((int)time(nullptr)); params.seed = rand(); } @@ -1044,9 +1325,9 @@ void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) { uint8_t* load_image(const char* image_path, int& width, int& height, int expected_width = 0, int expected_height = 0, int expected_channel = 3) { int c = 0; uint8_t* image_buffer = (uint8_t*)stbi_load(image_path, &width, &height, &c, expected_channel); - if (image_buffer == NULL) { + if (image_buffer == nullptr) { fprintf(stderr, "load image from '%s' failed\n", image_path); - return NULL; + return nullptr; } if (c < expected_channel) { fprintf(stderr, @@ -1056,17 +1337,17 @@ uint8_t* load_image(const char* image_path, int& width, int& height, int expecte c, image_path); free(image_buffer); - return NULL; + return nullptr; } if (width <= 0) { fprintf(stderr, "error: the width of image must be greater than 0, image_path = %s\n", image_path); free(image_buffer); - return NULL; + return nullptr; } if (height <= 0) { fprintf(stderr, "error: the height of image must be greater than 0, image_path = %s\n", image_path); free(image_buffer); - return NULL; + return nullptr; } // Resize input image ... 
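// A minimal usage sketch of the load_image contract, mirroring how main() calls it
// further below ("input.png" and the 512x512 size are made-up values): the caller
// passes the expected output size, checks for nullptr on failure, and later frees
// the returned buffer.
//
//   int w = 0, h = 0;
//   uint8_t* data = load_image("input.png", w, h, /*expected_width*/ 512, /*expected_height*/ 512);
//   if (data == nullptr) {
//       fprintf(stderr, "load image from 'input.png' failed\n");
//   } else {
//       sd_image_t img = {(uint32_t)512, (uint32_t)512, 3, data};
//       // ... use img ...
//       free(img.data);
//   }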
@@ -1088,10 +1369,10 @@ uint8_t* load_image(const char* image_path, int& width, int& height, int expecte if (crop_x != 0 || crop_y != 0) { printf("crop input image from %dx%d to %dx%d, image_path = %s\n", width, height, crop_w, crop_h, image_path); uint8_t* cropped_image_buffer = (uint8_t*)malloc(crop_w * crop_h * expected_channel); - if (cropped_image_buffer == NULL) { + if (cropped_image_buffer == nullptr) { fprintf(stderr, "error: allocate memory for crop\n"); free(image_buffer); - return NULL; + return nullptr; } for (int row = 0; row < crop_h; row++) { uint8_t* src = image_buffer + ((crop_y + row) * width + crop_x) * expected_channel; @@ -1110,10 +1391,10 @@ uint8_t* load_image(const char* image_path, int& width, int& height, int expecte int resized_width = expected_width; uint8_t* resized_image_buffer = (uint8_t*)malloc(resized_height * resized_width * expected_channel); - if (resized_image_buffer == NULL) { + if (resized_image_buffer == nullptr) { fprintf(stderr, "error: allocate memory for resize input image\n"); free(image_buffer); - return NULL; + return nullptr; } stbir_resize(image_buffer, width, height, 0, resized_image_buffer, resized_width, resized_height, 0, STBIR_TYPE_UINT8, @@ -1164,7 +1445,7 @@ bool load_images_from_dir(const std::string dir, int width = 0; int height = 0; uint8_t* image_buffer = load_image(path.c_str(), width, height, expected_width, expected_height); - if (image_buffer == NULL) { + if (image_buffer == nullptr) { fprintf(stderr, "load image from '%s' failed\n", path.c_str()); return false; } @@ -1216,10 +1497,10 @@ int main(int argc, const char* argv[]) { } bool vae_decode_only = true; - sd_image_t init_image = {(uint32_t)params.width, (uint32_t)params.height, 3, NULL}; - sd_image_t end_image = {(uint32_t)params.width, (uint32_t)params.height, 3, NULL}; - sd_image_t control_image = {(uint32_t)params.width, (uint32_t)params.height, 3, NULL}; - sd_image_t mask_image = {(uint32_t)params.width, (uint32_t)params.height, 1, NULL}; + sd_image_t init_image = {(uint32_t)params.width, (uint32_t)params.height, 3, nullptr}; + sd_image_t end_image = {(uint32_t)params.width, (uint32_t)params.height, 3, nullptr}; + sd_image_t control_image = {(uint32_t)params.width, (uint32_t)params.height, 3, nullptr}; + sd_image_t mask_image = {(uint32_t)params.width, (uint32_t)params.height, 1, nullptr}; std::vector ref_images; std::vector pmid_images; std::vector control_frames; @@ -1231,17 +1512,17 @@ int main(int argc, const char* argv[]) { free(mask_image.data); for (auto image : ref_images) { free(image.data); - image.data = NULL; + image.data = nullptr; } ref_images.clear(); for (auto image : pmid_images) { free(image.data); - image.data = NULL; + image.data = nullptr; } pmid_images.clear(); for (auto image : control_frames) { free(image.data); - image.data = NULL; + image.data = nullptr; } control_frames.clear(); }; @@ -1252,7 +1533,7 @@ int main(int argc, const char* argv[]) { int width = 0; int height = 0; init_image.data = load_image(params.init_image_path.c_str(), width, height, params.width, params.height); - if (init_image.data == NULL) { + if (init_image.data == nullptr) { fprintf(stderr, "load image from '%s' failed\n", params.init_image_path.c_str()); release_all_resources(); return 1; @@ -1265,7 +1546,7 @@ int main(int argc, const char* argv[]) { int width = 0; int height = 0; end_image.data = load_image(params.end_image_path.c_str(), width, height, params.width, params.height); - if (end_image.data == NULL) { + if (end_image.data == nullptr) { fprintf(stderr, 
"load image from '%s' failed\n", params.end_image_path.c_str()); release_all_resources(); return 1; @@ -1277,7 +1558,7 @@ int main(int argc, const char* argv[]) { int width = 0; int height = 0; mask_image.data = load_image(params.mask_image_path.c_str(), width, height, params.width, params.height, 1); - if (mask_image.data == NULL) { + if (mask_image.data == nullptr) { fprintf(stderr, "load image from '%s' failed\n", params.mask_image_path.c_str()); release_all_resources(); return 1; @@ -1285,7 +1566,7 @@ int main(int argc, const char* argv[]) { } else { mask_image.data = (uint8_t*)malloc(params.width * params.height); memset(mask_image.data, 255, params.width * params.height); - if (mask_image.data == NULL) { + if (mask_image.data == nullptr) { fprintf(stderr, "malloc mask image failed\n"); release_all_resources(); return 1; @@ -1296,7 +1577,7 @@ int main(int argc, const char* argv[]) { int width = 0; int height = 0; control_image.data = load_image(params.control_image_path.c_str(), width, height, params.width, params.height); - if (control_image.data == NULL) { + if (control_image.data == nullptr) { fprintf(stderr, "load image from '%s' failed\n", params.control_image_path.c_str()); release_all_resources(); return 1; @@ -1317,7 +1598,7 @@ int main(int argc, const char* argv[]) { int width = 0; int height = 0; uint8_t* image_buffer = load_image(path.c_str(), width, height); - if (image_buffer == NULL) { + if (image_buffer == nullptr) { fprintf(stderr, "load image from '%s' failed\n", path.c_str()); release_all_resources(); return 1; @@ -1399,18 +1680,18 @@ int main(int argc, const char* argv[]) { if (params.mode == UPSCALE) { num_results = 1; results = (sd_image_t*)calloc(num_results, sizeof(sd_image_t)); - if (results == NULL) { + if (results == nullptr) { printf("failed to allocate results array\n"); release_all_resources(); return 1; } results[0] = init_image; - init_image.data = NULL; + init_image.data = nullptr; } else { sd_ctx_t* sd_ctx = new_sd_ctx(&sd_ctx_params); - if (sd_ctx == NULL) { + if (sd_ctx == nullptr) { printf("new_sd_ctx_t failed\n"); release_all_resources(); return 1; @@ -1428,6 +1709,7 @@ int main(int argc, const char* argv[]) { init_image, ref_images.data(), (int)ref_images.size(), + params.auto_resize_ref_image, params.increase_ref_index, mask_image, params.width, @@ -1472,7 +1754,7 @@ int main(int argc, const char* argv[]) { results = generate_video(sd_ctx, &vid_gen_params, &num_results); } - if (results == NULL) { + if (results == nullptr) { printf("generate failed\n"); free_sd_ctx(sd_ctx); return 1; @@ -1486,19 +1768,20 @@ int main(int argc, const char* argv[]) { upscaler_ctx_t* upscaler_ctx = new_upscaler_ctx(params.esrgan_path.c_str(), params.offload_params_to_cpu, params.diffusion_conv_direct, - params.n_threads); + params.n_threads, + params.upscale_tile_size); - if (upscaler_ctx == NULL) { + if (upscaler_ctx == nullptr) { printf("new_upscaler_ctx failed\n"); } else { for (int i = 0; i < num_results; i++) { - if (results[i].data == NULL) { + if (results[i].data == nullptr) { continue; } sd_image_t current_image = results[i]; for (int u = 0; u < params.upscale_repeats; ++u) { sd_image_t upscaled_image = upscale(upscaler_ctx, current_image, upscale_factor); - if (upscaled_image.data == NULL) { + if (upscaled_image.data == nullptr) { printf("upscale failed\n"); break; } @@ -1556,7 +1839,7 @@ int main(int argc, const char* argv[]) { file_ext = ".png"; } for (int i = 0; i < num_results; i++) { - if (results[i].data == NULL) { + if (results[i].data == nullptr) { 
continue; } std::string final_image_path = i > 0 ? base_path + "_" + std::to_string(i + 1) + file_ext : base_path + file_ext; @@ -1574,7 +1857,7 @@ int main(int argc, const char* argv[]) { for (int i = 0; i < num_results; i++) { free(results[i].data); - results[i].data = NULL; + results[i].data = nullptr; } free(results); diff --git a/flux.hpp b/flux.hpp index 2ed41041..355184be 100644 --- a/flux.hpp +++ b/flux.hpp @@ -1,6 +1,7 @@ #ifndef __FLUX_HPP__ #define __FLUX_HPP__ +#include #include #include "ggml_extend.hpp" @@ -18,7 +19,7 @@ namespace Flux { blocks["out_layer"] = std::shared_ptr(new Linear(hidden_dim, hidden_dim, true)); } - struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { + struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override { // x: [..., in_dim] // return: [..., hidden_dim] auto in_layer = std::dynamic_pointer_cast(blocks["in_layer"]); @@ -36,7 +37,7 @@ namespace Flux { int64_t hidden_size; float eps; - void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") { + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") override { ggml_type wtype = GGML_TYPE_F32; params["scale"] = ggml_new_tensor_1d(ctx, wtype, hidden_size); } @@ -47,7 +48,7 @@ namespace Flux { : hidden_size(hidden_size), eps(eps) {} - struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { + struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override { struct ggml_tensor* w = params["scale"]; x = ggml_rms_norm(ctx, x, eps); x = ggml_mul(ctx, x, w); @@ -136,11 +137,11 @@ namespace Flux { }; struct ModulationOut { - ggml_tensor* shift = NULL; - ggml_tensor* scale = NULL; - ggml_tensor* gate = NULL; + ggml_tensor* shift = nullptr; + ggml_tensor* scale = nullptr; + ggml_tensor* gate = nullptr; - ModulationOut(ggml_tensor* shift = NULL, ggml_tensor* scale = NULL, ggml_tensor* gate = NULL) + ModulationOut(ggml_tensor* shift = nullptr, ggml_tensor* scale = nullptr, ggml_tensor* gate = nullptr) : shift(shift), scale(scale), gate(gate) {} ModulationOut(struct ggml_context* ctx, ggml_tensor* vec, int64_t offset) { @@ -259,7 +260,7 @@ namespace Flux { struct ggml_tensor* txt, struct ggml_tensor* vec, struct ggml_tensor* pe, - struct ggml_tensor* mask = NULL) { + struct ggml_tensor* mask = nullptr) { // img: [N, n_img_token, hidden_size] // txt: [N, n_txt_token, hidden_size] // pe: [n_img_token + n_txt_token, d_head/2, 2, 2] @@ -398,7 +399,7 @@ namespace Flux { ModulationOut get_distil_mod(struct ggml_context* ctx, struct ggml_tensor* vec) { int64_t offset = 3 * idx; - return ModulationOut(ctx, vec, offset); + return {ctx, vec, offset}; } struct ggml_tensor* forward(struct ggml_context* ctx, @@ -406,7 +407,7 @@ namespace Flux { struct ggml_tensor* x, struct ggml_tensor* vec, struct ggml_tensor* pe, - struct ggml_tensor* mask = NULL) { + struct ggml_tensor* mask = nullptr) { // x: [N, n_token, hidden_size] // pe: [n_token, d_head/2, 2, 2] // return: [N, n_token, hidden_size] @@ -485,7 +486,7 @@ namespace Flux { auto shift = ggml_view_2d(ctx, vec, vec->ne[0], vec->ne[1], vec->nb[1], stride * (offset + 0)); // [N, dim] auto scale = ggml_view_2d(ctx, vec, vec->ne[0], vec->ne[1], vec->nb[1], stride * (offset + 1)); // [N, dim] // No gate - return ModulationOut(shift, scale, NULL); + return {shift, scale, nullptr}; } struct ggml_tensor* forward(struct ggml_context* ctx, @@ -664,7 +665,7 @@ namespace 
Flux { struct ggml_tensor* y, struct ggml_tensor* guidance, struct ggml_tensor* pe, - struct ggml_tensor* mod_index_arange = NULL, + struct ggml_tensor* mod_index_arange = nullptr, std::vector skip_layers = {}) { auto img_in = std::dynamic_pointer_cast(blocks["img_in"]); auto txt_in = std::dynamic_pointer_cast(blocks["txt_in"]); @@ -672,7 +673,7 @@ namespace Flux { img = img_in->forward(ctx, img); struct ggml_tensor* vec; - struct ggml_tensor* txt_img_mask = NULL; + struct ggml_tensor* txt_img_mask = nullptr; if (params.is_chroma) { int64_t mod_index_length = 344; auto approx = std::dynamic_pointer_cast(blocks["distilled_guidance_layer"]); @@ -681,7 +682,7 @@ namespace Flux { // auto mod_index_arange = ggml_arange(ctx, 0, (float)mod_index_length, 1); // ggml_arange tot working on a lot of backends, precomputing it on CPU instead - GGML_ASSERT(arange != NULL); + GGML_ASSERT(arange != nullptr); auto modulation_index = ggml_nn_timestep_embedding(ctx, mod_index_arange, 32, 10000, 1000.f); // [1, 344, 32] // Batch broadcast (will it ever be useful) @@ -695,7 +696,7 @@ namespace Flux { vec = ggml_cont(ctx, ggml_permute(ctx, vec, 0, 2, 1, 3)); // [344, N, 64] vec = approx->forward(ctx, vec); // [344, N, hidden_size] - if (y != NULL) { + if (y != nullptr) { txt_img_mask = ggml_pad(ctx, y, img->ne[1], 0, 0, 0); } } else { @@ -703,7 +704,7 @@ namespace Flux { auto vector_in = std::dynamic_pointer_cast(blocks["vector_in"]); vec = time_in->forward(ctx, ggml_nn_timestep_embedding(ctx, timesteps, 256, 10000, 1000.f)); if (params.guidance_embed) { - GGML_ASSERT(guidance != NULL); + GGML_ASSERT(guidance != nullptr); auto guidance_in = std::dynamic_pointer_cast(blocks["guidance_in"]); // bf16 and fp16 result is different auto g_in = ggml_nn_timestep_embedding(ctx, guidance, 256, 10000, 1000.f); @@ -775,14 +776,14 @@ namespace Flux { struct ggml_tensor* y, struct ggml_tensor* guidance, struct ggml_tensor* pe, - struct ggml_tensor* mod_index_arange = NULL, + struct ggml_tensor* mod_index_arange = nullptr, std::vector ref_latents = {}, std::vector skip_layers = {}) { // Forward pass of DiT. 
// x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images) // timestep: (N,) tensor of diffusion timesteps // context: (N, L, D) - // c_concat: NULL, or for (N,C+M, H, W) for Fill + // c_concat: nullptr, or for (N,C+M, H, W) for Fill // y: (N, adm_in_channels) tensor of class labels // guidance: (N,) // pe: (L, d_head/2, 2, 2) @@ -801,7 +802,7 @@ namespace Flux { uint64_t img_tokens = img->ne[1]; if (params.version == VERSION_FLUX_FILL) { - GGML_ASSERT(c_concat != NULL); + GGML_ASSERT(c_concat != nullptr); ggml_tensor* masked = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], 0); ggml_tensor* mask = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 8 * 8, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C); @@ -810,7 +811,7 @@ namespace Flux { img = ggml_concat(ctx, img, ggml_concat(ctx, masked, mask, 0), 0); } else if (params.version == VERSION_FLEX_2) { - GGML_ASSERT(c_concat != NULL); + GGML_ASSERT(c_concat != nullptr); ggml_tensor* masked = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], 0); ggml_tensor* mask = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 1, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C); ggml_tensor* control = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * (C + 1)); @@ -825,7 +826,7 @@ namespace Flux { img = ggml_concat(ctx, img, ggml_concat(ctx, ggml_concat(ctx, masked, mask, 0), control, 0), 0); } else if (params.version == VERSION_FLUX_CONTROLS) { - GGML_ASSERT(c_concat != NULL); + GGML_ASSERT(c_concat != nullptr); ggml_tensor* control = ggml_pad(ctx, c_concat, pad_w, pad_h, 0, 0); control = patchify(ctx, control, patch_size); @@ -924,7 +925,7 @@ namespace Flux { flux.init(params_ctx, tensor_types, prefix); } - std::string get_desc() { + std::string get_desc() override { return "flux"; } @@ -944,18 +945,18 @@ namespace Flux { GGML_ASSERT(x->ne[3] == 1); struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, FLUX_GRAPH_SIZE, false); - struct ggml_tensor* mod_index_arange = NULL; + struct ggml_tensor* mod_index_arange = nullptr; x = to_backend(x); context = to_backend(context); - if (c_concat != NULL) { + if (c_concat != nullptr) { c_concat = to_backend(c_concat); } if (flux_params.is_chroma) { guidance = ggml_set_f32(guidance, 0); if (!use_mask) { - y = NULL; + y = nullptr; } // ggml_arange is not working on some backends, precompute it @@ -987,7 +988,7 @@ namespace Flux { auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, flux_params.axes_dim_sum / 2, pos_len); // pe->data = pe_vec.data(); // print_ggml_tensor(pe); - // pe->data = NULL; + // pe->data = nullptr; set_backend_tensor_data(pe, pe_vec.data()); struct ggml_tensor* out = flux.forward(compute_ctx, @@ -1017,8 +1018,8 @@ namespace Flux { struct ggml_tensor* guidance, std::vector ref_latents = {}, bool increase_ref_index = false, - struct ggml_tensor** output = NULL, - struct ggml_context* output_ctx = NULL, + struct ggml_tensor** output = nullptr, + struct ggml_context* output_ctx = nullptr, std::vector skip_layers = std::vector()) { // x: [N, in_channels, h, w] // timesteps: [N, ] @@ -1035,11 +1036,11 @@ namespace Flux { void test() { struct ggml_init_params params; params.mem_size = static_cast(20 * 1024 * 1024); // 20 MB - params.mem_buffer = NULL; + 
params.mem_buffer = nullptr; params.no_alloc = false; struct ggml_context* work_ctx = ggml_init(params); - GGML_ASSERT(work_ctx != NULL); + GGML_ASSERT(work_ctx != nullptr); { // cpu f16: @@ -1063,10 +1064,10 @@ namespace Flux { ggml_set_f32(y, 0.01f); // print_ggml_tensor(y); - struct ggml_tensor* out = NULL; + struct ggml_tensor* out = nullptr; int t0 = ggml_time_ms(); - compute(8, x, timesteps, context, NULL, y, guidance, {}, false, &out, work_ctx); + compute(8, x, timesteps, context, nullptr, y, guidance, {}, false, &out, work_ctx); int t1 = ggml_time_ms(); print_ggml_tensor(out); @@ -1078,7 +1079,7 @@ namespace Flux { // ggml_backend_t backend = ggml_backend_cuda_init(0); ggml_backend_t backend = ggml_backend_cpu_init(); ggml_type model_data_type = GGML_TYPE_Q8_0; - std::shared_ptr<FluxRunner> flux = std::shared_ptr<FluxRunner>(new FluxRunner(backend, false)); + std::shared_ptr<FluxRunner> flux = std::make_shared<FluxRunner>(backend, false); { LOG_INFO("loading from '%s'", file_path.c_str()); diff --git a/format-code.sh b/format-code.sh index 9fdba32e..adad801f 100644 --- a/format-code.sh +++ b/format-code.sh @@ -1,5 +1,8 @@ for f in *.cpp *.h *.hpp examples/cli/*.cpp examples/cli/*.h; do [[ "$f" == vocab* ]] && continue echo "formatting '$f'" + # if [ "$f" != "stable-diffusion.h" ]; then + # clang-tidy -fix -p build_linux/ "$f" + # fi clang-format -style=file -i "$f" done \ No newline at end of file diff --git a/ggml b/ggml index 7bffd79a..c538174d 160000 --- a/ggml +++ b/ggml @@ -1 +1 @@ -Subproject commit 7bffd79a4bec72e9a3bfbedb582a218b84401c13 +Subproject commit c538174d261d8172480f87efcfec8e69aac13ebb diff --git a/ggml_extend.hpp b/ggml_extend.hpp index d8df0d8f..02d82bc0 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -105,7 +105,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_mul_n_mode(struct ggml_context* ctx, return result; } -__STATIC_INLINE__ struct ggml_tensor* ggml_merge_lora(ggml_context* ctx, struct ggml_tensor* lora_down, struct ggml_tensor* lora_up, struct ggml_tensor* lora_mid = NULL) { +__STATIC_INLINE__ struct ggml_tensor* ggml_merge_lora(ggml_context* ctx, struct ggml_tensor* lora_down, struct ggml_tensor* lora_up, struct ggml_tensor* lora_mid = nullptr) { struct ggml_tensor* updown; // flat lora tensors to multiply it int64_t lora_up_rows = lora_up->ne[ggml_n_dims(lora_up) - 1]; @@ -118,7 +118,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_merge_lora(ggml_context* ctx, struct // ggml_mul_mat requires tensor b transposed lora_down = ggml_cont(ctx, ggml_transpose(ctx, lora_down)); - if (lora_mid == NULL) { + if (lora_mid == nullptr) { updown = ggml_mul_mat(ctx, lora_up, lora_down); updown = ggml_cont(ctx, ggml_transpose(ctx, updown)); } else { @@ -165,7 +165,7 @@ __STATIC_INLINE__ void ggml_tensor_set_f32(struct ggml_tensor* tensor, float val } __STATIC_INLINE__ float ggml_tensor_get_f32(const ggml_tensor* tensor, int l, int k = 0, int j = 0, int i = 0) { - if (tensor->buffer != NULL) { + if (tensor->buffer != nullptr) { float value; ggml_backend_tensor_get(tensor, &value, i * tensor->nb[3] + j * tensor->nb[2] + k * tensor->nb[1] + l * tensor->nb[0], sizeof(float)); return value; @@ -175,7 +175,7 @@ __STATIC_INLINE__ int ggml_tensor_get_i32(const ggml_tensor* tensor, int l, in - if (tensor->buffer != NULL) { + if (tensor->buffer != nullptr) { float value; ggml_backend_tensor_get(tensor, &value, i * tensor->nb[3] + j * tensor->nb[2] + k * tensor->nb[1] + l * tensor->nb[0], sizeof(int)); 
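// note: `value` is declared as float here, yet only sizeof(int) bytes are read into it
// and the float is converted to int on return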
return value; @@ -292,7 +292,7 @@ __STATIC_INLINE__ ggml_tensor* load_tensor_from_file(ggml_context* ctx, const st std::ifstream file(file_path, std::ios::binary); if (!file.is_open()) { LOG_ERROR("failed to open '%s'", file_path.c_str()); - return NULL; + return nullptr; } int32_t n_dims; int32_t length; @@ -306,7 +306,7 @@ __STATIC_INLINE__ ggml_tensor* load_tensor_from_file(ggml_context* ctx, const st if (file.eof()) { LOG_ERROR("incomplete file '%s'", file_path.c_str()); - return NULL; + return nullptr; } int32_t nelements = 1; @@ -354,7 +354,7 @@ __STATIC_INLINE__ void copy_ggml_tensor(struct ggml_tensor* dst, struct ggml_ten } struct ggml_init_params params; params.mem_size = 10 * 1024 * 1024; // for padding - params.mem_buffer = NULL; + params.mem_buffer = nullptr; params.no_alloc = false; struct ggml_context* ctx = ggml_init(params); if (!ctx) { @@ -860,7 +860,7 @@ __STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input, params.mem_size += input_tile_size_x * input_tile_size_y * input->ne[2] * input->ne[3] * sizeof(float); // input chunk params.mem_size += output_tile_size_x * output_tile_size_y * output->ne[2] * output->ne[3] * sizeof(float); // output chunk params.mem_size += 3 * ggml_tensor_overhead(); - params.mem_buffer = NULL; + params.mem_buffer = nullptr; params.no_alloc = false; LOG_DEBUG("tile work buffer size: %.2f MB", params.mem_size / 1024.f / 1024.f); @@ -961,7 +961,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_linear(struct ggml_context* ctx, if (scale != 1.f) { x = ggml_scale(ctx, x, 1.f / scale); } - if (b != NULL) { + if (b != nullptr) { x = ggml_add_inplace(ctx, x, b); } return x; @@ -994,7 +994,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d(struct ggml_context* ctx, if (scale != 1.f) { x = ggml_scale(ctx, x, 1.f / scale); } - if (b != NULL) { + if (b != nullptr) { b = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1); x = ggml_add_inplace(ctx, x, b); } @@ -1023,7 +1023,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_3d(struct ggml_context* ctx, int64_t N = x->ne[3] / IC; x = ggml_conv_3d(ctx, w, x, IC, s0, s1, s2, p0, p1, p2, d0, d1, d2); - if (b != NULL) { + if (b != nullptr) { b = ggml_reshape_4d(ctx, b, 1, 1, 1, b->ne[0]); // [OC, 1, 1, 1] x = ggml_add_inplace(ctx, x, b); } @@ -1042,7 +1042,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_3d_nx1x1(struct ggml_context* int p2 = 1, int d2 = 1) { x = ggml_conv_2d(ctx, w, x, 1, s2, 0, p2, 1, d2); // [N, OC, T, OH * OW] - if (b != NULL) { + if (b != nullptr) { b = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1); x = ggml_add(ctx, x, b); } @@ -1146,7 +1146,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention_ext(struct ggml_context* struct ggml_tensor* k, struct ggml_tensor* v, int64_t n_head, - struct ggml_tensor* mask = NULL, + struct ggml_tensor* mask = nullptr, bool diag_mask_inf = false, bool skip_reshape = false, bool flash_attn = false, // avoid overflow @@ -1293,9 +1293,9 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_layer_norm(struct ggml_context* ct struct ggml_tensor* b, float eps = EPS) { x = ggml_norm(ctx, x, eps); - if (w != NULL) { + if (w != nullptr) { x = ggml_mul_inplace(ctx, x, w); - if (b != NULL) { + if (b != nullptr) { x = ggml_add_inplace(ctx, x, b); } } @@ -1307,14 +1307,14 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_group_norm(struct ggml_context* ct struct ggml_tensor* w, struct ggml_tensor* b, int num_groups = 32) { - if (ggml_n_dims(x) >= 3 && w != NULL && b != NULL) { + if (ggml_n_dims(x) >= 3 && w != nullptr && b != nullptr) { w = 
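The ggml_nn_layer_norm / ggml_nn_group_norm hunks above encode the contract "nullptr weight or bias means no affine transform". A scalar sketch of that contract on plain arrays, illustrative only and not the ggml kernels:

#include <cmath>
#include <cstddef>
#include <vector>

// Normalize x to zero mean / unit variance, then apply the optional affine
// parameters. w == nullptr / b == nullptr skips that part, mirroring the
// nullptr checks in ggml_nn_layer_norm above.
std::vector<float> layer_norm_sketch(const std::vector<float>& x,
                                     const float* w, const float* b, float eps = 1e-5f) {
    float mean = 0.f;
    for (float v : x) mean += v;
    mean /= (float)x.size();
    float var = 0.f;
    for (float v : x) var += (v - mean) * (v - mean);
    var /= (float)x.size();
    std::vector<float> y(x.size());
    for (size_t i = 0; i < x.size(); i++) {
        y[i] = (x[i] - mean) / std::sqrt(var + eps);
        if (w != nullptr) {
            y[i] *= w[i];
            if (b != nullptr) y[i] += b[i];
        }
    }
    return y;
}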
ggml_reshape_4d(ctx, w, 1, 1, w->ne[0], 1); b = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1); } const float eps = 1e-6f; // default eps parameter x = ggml_group_norm(ctx, x, num_groups, eps); - if (w != NULL && b != NULL) { + if (w != nullptr && b != nullptr) { x = ggml_mul_inplace(ctx, x, w); // b = ggml_repeat(ctx, b, x); x = ggml_add_inplace(ctx, x, b); @@ -1422,7 +1422,7 @@ __STATIC_INLINE__ struct ggml_tensor* new_timestep_embedding(struct ggml_context // embedding: [N, dim] std::vector embedding_vec = timestep_embedding(timesteps, dim, max_period); struct ggml_tensor* embedding = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, dim, timesteps.size()); - if (embedding->data != NULL) { + if (embedding->data != nullptr) { memcpy(((char*)embedding->data), ((char*)embedding_vec.data()), ggml_nbytes(embedding)); } else { ggml_backend_tensor_set(embedding, embedding_vec.data(), 0, ggml_nbytes(embedding)); @@ -1458,23 +1458,23 @@ struct GGMLRunner { protected: typedef std::function get_graph_cb_t; - ggml_backend_t params_backend = NULL; - ggml_backend_t runtime_backend = NULL; + ggml_backend_t params_backend = nullptr; + ggml_backend_t runtime_backend = nullptr; - struct ggml_context* params_ctx = NULL; - ggml_backend_buffer_t params_buffer = NULL; - struct ggml_context* offload_ctx = NULL; - ggml_backend_buffer_t runtime_params_buffer = NULL; + struct ggml_context* params_ctx = nullptr; + ggml_backend_buffer_t params_buffer = nullptr; + struct ggml_context* offload_ctx = nullptr; + ggml_backend_buffer_t runtime_params_buffer = nullptr; bool params_on_runtime_backend = false; - struct ggml_context* cache_ctx = NULL; - ggml_backend_buffer_t cache_buffer = NULL; + struct ggml_context* cache_ctx = nullptr; + ggml_backend_buffer_t cache_buffer = nullptr; - struct ggml_context* compute_ctx = NULL; - struct ggml_gallocr* compute_allocr = NULL; + struct ggml_context* compute_ctx = nullptr; + struct ggml_gallocr* compute_allocr = nullptr; std::vector one_vec = {1.f}; - ggml_tensor* one_tensor = NULL; + ggml_tensor* one_tensor = nullptr; std::map backend_tensor_data_map; std::map cache_tensor_map; // name -> tensor @@ -1483,59 +1483,59 @@ struct GGMLRunner { void alloc_params_ctx() { struct ggml_init_params params; params.mem_size = static_cast(MAX_PARAMS_TENSOR_NUM * ggml_tensor_overhead()); - params.mem_buffer = NULL; + params.mem_buffer = nullptr; params.no_alloc = true; params_ctx = ggml_init(params); - GGML_ASSERT(params_ctx != NULL); + GGML_ASSERT(params_ctx != nullptr); if (params_backend != runtime_backend) { offload_ctx = ggml_init(params); - GGML_ASSERT(offload_ctx != NULL); + GGML_ASSERT(offload_ctx != nullptr); } } void free_params_ctx() { - if (params_ctx != NULL) { + if (params_ctx != nullptr) { ggml_free(params_ctx); - params_ctx = NULL; + params_ctx = nullptr; } - if (offload_ctx != NULL) { + if (offload_ctx != nullptr) { ggml_free(offload_ctx); - offload_ctx = NULL; + offload_ctx = nullptr; } } void alloc_cache_ctx() { struct ggml_init_params params; params.mem_size = static_cast(MAX_PARAMS_TENSOR_NUM * ggml_tensor_overhead()); - params.mem_buffer = NULL; + params.mem_buffer = nullptr; params.no_alloc = true; cache_ctx = ggml_init(params); - GGML_ASSERT(cache_ctx != NULL); + GGML_ASSERT(cache_ctx != nullptr); } void free_cache_ctx() { - if (cache_ctx != NULL) { + if (cache_ctx != nullptr) { ggml_free(cache_ctx); - cache_ctx = NULL; + cache_ctx = nullptr; } } void alloc_compute_ctx() { struct ggml_init_params params; params.mem_size = static_cast(ggml_tensor_overhead() * MAX_GRAPH_SIZE + 
ggml_graph_overhead()); - params.mem_buffer = NULL; + params.mem_buffer = nullptr; params.no_alloc = true; compute_ctx = ggml_init(params); - GGML_ASSERT(compute_ctx != NULL); + GGML_ASSERT(compute_ctx != nullptr); } void free_compute_ctx() { - if (compute_ctx != NULL) { + if (compute_ctx != nullptr) { ggml_free(compute_ctx); - compute_ctx = NULL; + compute_ctx = nullptr; } } @@ -1559,7 +1559,7 @@ struct GGMLRunner { } bool alloc_compute_buffer(get_graph_cb_t get_graph) { - if (compute_allocr != NULL) { + if (compute_allocr != nullptr) { return true; } reset_compute_ctx(); @@ -1584,9 +1584,9 @@ struct GGMLRunner { } void free_cache_buffer() { - if (cache_buffer != NULL) { + if (cache_buffer != nullptr) { ggml_backend_buffer_free(cache_buffer); - cache_buffer = NULL; + cache_buffer = nullptr; } } @@ -1596,7 +1596,7 @@ struct GGMLRunner { } free_cache_ctx_and_buffer(); alloc_cache_ctx(); - GGML_ASSERT(cache_buffer == NULL); + GGML_ASSERT(cache_buffer == nullptr); std::map runtime_tensor_to_cache_tensor; for (auto kv : cache_tensor_map) { auto cache_tensor = ggml_dup_tensor(cache_ctx, kv.second); @@ -1605,7 +1605,7 @@ struct GGMLRunner { } size_t num_tensors = ggml_tensor_num(cache_ctx); cache_buffer = ggml_backend_alloc_ctx_tensors(cache_ctx, runtime_backend); - GGML_ASSERT(cache_buffer != NULL); + GGML_ASSERT(cache_buffer != nullptr); for (auto kv : runtime_tensor_to_cache_tensor) { ggml_backend_tensor_copy(kv.first, kv.second); } @@ -1637,12 +1637,12 @@ struct GGMLRunner { if (params_on_runtime_backend) { return true; } - GGML_ASSERT(runtime_params_buffer == NULL); + GGML_ASSERT(runtime_params_buffer == nullptr); int64_t t0 = ggml_time_ms(); size_t num_tensors = ggml_tensor_num(offload_ctx); if (num_tensors == 0) { - for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != NULL; t = ggml_get_next_tensor(params_ctx, t)) { - GGML_ASSERT(t->view_src == NULL); + for (ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != nullptr; t = ggml_get_next_tensor(params_ctx, t)) { + GGML_ASSERT(t->view_src == nullptr); ggml_dup_tensor(offload_ctx, t); } } @@ -1651,7 +1651,7 @@ struct GGMLRunner { runtime_params_buffer = ggml_backend_alloc_ctx_tensors(offload_ctx, runtime_backend); - if (runtime_params_buffer == NULL) { + if (runtime_params_buffer == nullptr) { LOG_ERROR("%s alloc runtime params backend buffer failed, num_tensors = %i", get_desc().c_str(), num_tensors); @@ -1661,7 +1661,7 @@ struct GGMLRunner { ggml_tensor* t = ggml_get_first_tensor(params_ctx); ggml_tensor* offload_t = ggml_get_first_tensor(offload_ctx); - while (t != NULL && offload_t != NULL) { + while (t != nullptr && offload_t != nullptr) { ggml_backend_tensor_copy(t, offload_t); std::swap(t->buffer, offload_t->buffer); std::swap(t->data, offload_t->data); @@ -1693,21 +1693,21 @@ struct GGMLRunner { ggml_tensor* t = ggml_get_first_tensor(params_ctx); ggml_tensor* offload_t = ggml_get_first_tensor(offload_ctx); - while (t != NULL && offload_t != NULL) { + while (t != nullptr && offload_t != nullptr) { t->buffer = offload_t->buffer; t->data = offload_t->data; t->extra = offload_t->extra; - offload_t->buffer = NULL; - offload_t->data = NULL; - offload_t->extra = NULL; + offload_t->buffer = nullptr; + offload_t->data = nullptr; + offload_t->extra = nullptr; t = ggml_get_next_tensor(params_ctx, t); offload_t = ggml_get_next_tensor(offload_ctx, offload_t); } - if (runtime_params_buffer != NULL) { + if (runtime_params_buffer != nullptr) { ggml_backend_buffer_free(runtime_params_buffer); - runtime_params_buffer = NULL; + 
runtime_params_buffer = nullptr; } params_on_runtime_backend = false; } @@ -1744,7 +1744,7 @@ struct GGMLRunner { bool alloc_params_buffer() { size_t num_tensors = ggml_tensor_num(params_ctx); params_buffer = ggml_backend_alloc_ctx_tensors(params_ctx, params_backend); - if (params_buffer == NULL) { + if (params_buffer == nullptr) { LOG_ERROR("%s alloc params backend buffer failed, num_tensors = %i", get_desc().c_str(), num_tensors); @@ -1760,14 +1760,14 @@ struct GGMLRunner { } void free_params_buffer() { - if (params_buffer != NULL) { + if (params_buffer != nullptr) { ggml_backend_buffer_free(params_buffer); - params_buffer = NULL; + params_buffer = nullptr; } } size_t get_params_buffer_size() { - if (params_buffer != NULL) { + if (params_buffer != nullptr) { return ggml_backend_buffer_get_size(params_buffer); } return 0; @@ -1779,9 +1779,9 @@ struct GGMLRunner { } void free_compute_buffer() { - if (compute_allocr != NULL) { + if (compute_allocr != nullptr) { ggml_gallocr_free(compute_allocr); - compute_allocr = NULL; + compute_allocr = nullptr; } offload_params_to_params_backend(); } @@ -1792,12 +1792,12 @@ struct GGMLRunner { } struct ggml_tensor* to_backend(struct ggml_tensor* tensor) { - GGML_ASSERT(compute_ctx != NULL); - if (tensor == NULL) { - return NULL; + GGML_ASSERT(compute_ctx != nullptr); + if (tensor == nullptr) { + return nullptr; } // it's performing a compute, check if backend isn't cpu - if (!ggml_backend_is_cpu(runtime_backend) && (tensor->buffer == NULL || ggml_backend_buffer_is_host(tensor->buffer))) { + if (!ggml_backend_is_cpu(runtime_backend) && (tensor->buffer == nullptr || ggml_backend_buffer_is_host(tensor->buffer))) { // pass input tensors to gpu memory auto backend_tensor = ggml_dup_tensor(compute_ctx, tensor); @@ -1813,8 +1813,8 @@ struct GGMLRunner { } struct ggml_tensor* get_cache_tensor_by_name(const std::string& name) { - if (cache_ctx == NULL) { - return NULL; + if (cache_ctx == nullptr) { + return nullptr; } return ggml_get_tensor(cache_ctx, name.c_str()); } @@ -1822,8 +1822,8 @@ struct GGMLRunner { void compute(get_graph_cb_t get_graph, int n_threads, bool free_compute_buffer_immediately = true, - struct ggml_tensor** output = NULL, - struct ggml_context* output_ctx = NULL) { + struct ggml_tensor** output = nullptr, + struct ggml_context* output_ctx = nullptr) { if (!offload_params_to_runtime_backend()) { LOG_ERROR("%s offload params to runtime backend failed", get_desc().c_str()); return; @@ -1842,12 +1842,12 @@ struct GGMLRunner { ggml_graph_print(gf); #endif copy_cache_tensors_to_cache_buffer(); - if (output != NULL) { + if (output != nullptr) { auto result = ggml_get_tensor(compute_ctx, final_result_name.c_str()); - if (*output == NULL && output_ctx != NULL) { + if (*output == nullptr && output_ctx != nullptr) { *output = ggml_dup_tensor(output_ctx, result); } - if (*output != NULL) { + if (*output != nullptr) { ggml_backend_tensor_get_and_sync(runtime_backend, result, (*output)->data, 0, ggml_nbytes(*output)); } } @@ -1994,7 +1994,7 @@ class Linear : public UnaryBlock { struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { struct ggml_tensor* w = params["weight"]; - struct ggml_tensor* b = NULL; + struct ggml_tensor* b = nullptr; if (bias) { b = params["bias"]; } @@ -2098,7 +2098,7 @@ class Conv2d : public UnaryBlock { struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { struct ggml_tensor* w = params["weight"]; - struct ggml_tensor* b = NULL; + struct ggml_tensor* b = nullptr; if (bias) { b = 
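The to_backend / compute pair above is the pattern every runner touched by this diff follows: a build_graph method captures the inputs, and a lambda hands the finished graph to GGMLRunner::compute. A schematic subclass, where MyRunner and the ggml ops are purely illustrative and the GGMLRunner API is assumed from the signatures visible in this hunk:

// Sketch only: MyRunner is hypothetical; compute_ctx, to_backend, compute and
// get_desc are the GGMLRunner members shown above.
struct MyRunner : public GGMLRunner {
    using GGMLRunner::GGMLRunner;

    std::string get_desc() override { return "my_runner"; }

    struct ggml_cgraph* build_graph(struct ggml_tensor* x) {
        struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
        x = to_backend(x);                                  // copy input to the runtime backend if needed
        struct ggml_tensor* y = ggml_relu(compute_ctx, x);  // stand-in for the real model graph
        ggml_build_forward_expand(gf, y);
        return gf;
    }

    void compute(int n_threads, struct ggml_tensor* x,
                 struct ggml_tensor** output, struct ggml_context* output_ctx = nullptr) {
        auto get_graph = [&]() -> struct ggml_cgraph* { return build_graph(x); };
        GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
    }
};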
params["bias"]; } @@ -2156,7 +2156,7 @@ class Conv3dnx1x1 : public UnaryBlock { // result: [N, OC, OD, OH*OW] struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { struct ggml_tensor* w = params["weight"]; - struct ggml_tensor* b = NULL; + struct ggml_tensor* b = nullptr; if (bias) { b = params["bias"]; } @@ -2205,7 +2205,7 @@ class Conv3d : public UnaryBlock { struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { struct ggml_tensor* w = params["weight"]; - struct ggml_tensor* b = NULL; + struct ggml_tensor* b = nullptr; if (bias) { b = params["bias"]; } @@ -2245,8 +2245,8 @@ class LayerNorm : public UnaryBlock { bias(bias) {} struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { - struct ggml_tensor* w = NULL; - struct ggml_tensor* b = NULL; + struct ggml_tensor* w = nullptr; + struct ggml_tensor* b = nullptr; if (elementwise_affine) { w = params["weight"]; @@ -2285,8 +2285,8 @@ class GroupNorm : public GGMLBlock { affine(affine) {} struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { - struct ggml_tensor* w = NULL; - struct ggml_tensor* b = NULL; + struct ggml_tensor* w = nullptr; + struct ggml_tensor* b = nullptr; if (affine) { w = params["weight"]; b = params["bias"]; @@ -2369,7 +2369,7 @@ class MultiheadAttention : public GGMLBlock { struct ggml_tensor* k = k_proj->forward(ctx, x); struct ggml_tensor* v = v_proj->forward(ctx, x); - x = ggml_nn_attention_ext(ctx, backend, q, k, v, n_head, NULL, mask); // [N, n_token, embed_dim] + x = ggml_nn_attention_ext(ctx, backend, q, k, v, n_head, nullptr, mask); // [N, n_token, embed_dim] x = out_proj->forward(ctx, x); // [N, n_token, embed_dim] return x; diff --git a/lora.hpp b/lora.hpp index 1fce9569..0d403d58 100644 --- a/lora.hpp +++ b/lora.hpp @@ -100,7 +100,7 @@ struct LoraModel : public GGMLRunner { bool load_failed = false; bool applied = false; std::vector zero_index_vec = {0}; - ggml_tensor* zero_index = NULL; + ggml_tensor* zero_index = nullptr; enum lora_t type = REGULAR; LoraModel(ggml_backend_t backend, @@ -112,7 +112,7 @@ struct LoraModel : public GGMLRunner { } } - std::string get_desc() { + std::string get_desc() override { return "lora"; } @@ -287,7 +287,7 @@ struct LoraModel : public GGMLRunner { if (is_qkvm_split) { key = key.substr(sizeof("SPLIT_L|") - 1); } - struct ggml_tensor* updown = NULL; + struct ggml_tensor* updown = nullptr; float scale_value = 1.0f; std::string full_key = lora_pre[type] + key; if (is_bias) { @@ -314,13 +314,13 @@ struct LoraModel : public GGMLRunner { } std::string alpha_name = ""; - ggml_tensor* hada_1_mid = NULL; // tau for tucker decomposition - ggml_tensor* hada_1_up = NULL; - ggml_tensor* hada_1_down = NULL; + ggml_tensor* hada_1_mid = nullptr; // tau for tucker decomposition + ggml_tensor* hada_1_up = nullptr; + ggml_tensor* hada_1_down = nullptr; - ggml_tensor* hada_2_mid = NULL; // tau for tucker decomposition - ggml_tensor* hada_2_up = NULL; - ggml_tensor* hada_2_down = NULL; + ggml_tensor* hada_2_mid = nullptr; // tau for tucker decomposition + ggml_tensor* hada_2_up = nullptr; + ggml_tensor* hada_2_down = nullptr; std::string hada_1_mid_name = ""; std::string hada_1_down_name = ""; @@ -368,7 +368,7 @@ struct LoraModel : public GGMLRunner { applied_lora_tensors.insert(hada_2_up_name); applied_lora_tensors.insert(alpha_name); - if (hada_1_up == NULL || hada_1_down == NULL || hada_2_up == NULL || hada_2_down == NULL) { + if (hada_1_up == nullptr || hada_1_down == nullptr || hada_2_up == 
nullptr || hada_2_down == nullptr) { continue; } @@ -394,8 +394,8 @@ struct LoraModel : public GGMLRunner { std::string alpha_name = full_key + ".alpha"; - ggml_tensor* lokr_w1 = NULL; - ggml_tensor* lokr_w2 = NULL; + ggml_tensor* lokr_w1 = nullptr; + ggml_tensor* lokr_w2 = nullptr; std::string lokr_w1_name = ""; std::string lokr_w2_name = ""; @@ -407,8 +407,8 @@ struct LoraModel : public GGMLRunner { lokr_w1 = to_f32(compute_ctx, lora_tensors[lokr_w1_name]); applied_lora_tensors.insert(lokr_w1_name); } else { - ggml_tensor* down = NULL; - ggml_tensor* up = NULL; + ggml_tensor* down = nullptr; + ggml_tensor* up = nullptr; std::string down_name = lokr_w1_name + "_b"; std::string up_name = lokr_w1_name + "_a"; if (lora_tensors.find(down_name) != lora_tensors.end()) { @@ -432,8 +432,8 @@ struct LoraModel : public GGMLRunner { lokr_w2 = to_f32(compute_ctx, lora_tensors[lokr_w2_name]); applied_lora_tensors.insert(lokr_w2_name); } else { - ggml_tensor* down = NULL; - ggml_tensor* up = NULL; + ggml_tensor* down = nullptr; + ggml_tensor* up = nullptr; std::string down_name = lokr_w2_name + "_b"; std::string up_name = lokr_w2_name + "_a"; if (lora_tensors.find(down_name) != lora_tensors.end()) { @@ -460,9 +460,9 @@ struct LoraModel : public GGMLRunner { } else { // LoRA mode - ggml_tensor* lora_mid = NULL; // tau for tucker decomposition - ggml_tensor* lora_up = NULL; - ggml_tensor* lora_down = NULL; + ggml_tensor* lora_mid = nullptr; // tau for tucker decomposition + ggml_tensor* lora_up = nullptr; + ggml_tensor* lora_down = nullptr; std::string alpha_name = ""; std::string scale_name = ""; @@ -497,12 +497,12 @@ struct LoraModel : public GGMLRunner { auto split_k_alpha_name = full_key + "k" + suffix + ".alpha"; auto split_v_alpha_name = full_key + "v" + suffix + ".alpha"; - ggml_tensor* lora_q_down = NULL; - ggml_tensor* lora_q_up = NULL; - ggml_tensor* lora_k_down = NULL; - ggml_tensor* lora_k_up = NULL; - ggml_tensor* lora_v_down = NULL; - ggml_tensor* lora_v_up = NULL; + ggml_tensor* lora_q_down = nullptr; + ggml_tensor* lora_q_up = nullptr; + ggml_tensor* lora_k_down = nullptr; + ggml_tensor* lora_k_up = nullptr; + ggml_tensor* lora_v_down = nullptr; + ggml_tensor* lora_v_up = nullptr; lora_q_down = to_f32(compute_ctx, lora_tensors[split_q_d_name]); @@ -633,15 +633,15 @@ struct LoraModel : public GGMLRunner { auto split_v_alpha_name = full_key + "attn.to_v" + ".alpha"; auto split_m_alpha_name = full_key + "proj_mlp" + ".alpha"; - ggml_tensor* lora_q_down = NULL; - ggml_tensor* lora_q_up = NULL; - ggml_tensor* lora_k_down = NULL; - ggml_tensor* lora_k_up = NULL; - ggml_tensor* lora_v_down = NULL; - ggml_tensor* lora_v_up = NULL; + ggml_tensor* lora_q_down = nullptr; + ggml_tensor* lora_q_up = nullptr; + ggml_tensor* lora_k_down = nullptr; + ggml_tensor* lora_k_up = nullptr; + ggml_tensor* lora_v_down = nullptr; + ggml_tensor* lora_v_up = nullptr; - ggml_tensor* lora_m_down = NULL; - ggml_tensor* lora_m_up = NULL; + ggml_tensor* lora_m_down = nullptr; + ggml_tensor* lora_m_up = nullptr; lora_q_up = to_f32(compute_ctx, lora_tensors[split_q_u_name]); @@ -809,7 +809,7 @@ struct LoraModel : public GGMLRunner { } } - if (lora_up == NULL || lora_down == NULL) { + if (lora_up == nullptr || lora_down == nullptr) { continue; } // calc_scale diff --git a/ltxv.hpp b/ltxv.hpp index 6ff66811..fdd190f0 100644 --- a/ltxv.hpp +++ b/ltxv.hpp @@ -13,10 +13,10 @@ namespace LTXV { public: CausalConv3d(int64_t in_channels, int64_t out_channels, - int kernel_size = 3, - std::tuple stride = {1, 1, 1}, - int dilation = 
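For reference, the three low-rank families handled in this block differ only in how the weight delta is assembled (standard LyCORIS conventions; the exact alpha/scale handling is whatever this implementation applies):

\Delta W_{\mathrm{LoRA}} = \frac{\alpha}{r}\,\mathrm{up}\cdot\mathrm{down}
\qquad
\Delta W_{\mathrm{LoHa}} = \frac{\alpha}{r}\,(\mathrm{up}_1\,\mathrm{down}_1)\odot(\mathrm{up}_2\,\mathrm{down}_2)
\qquad
\Delta W_{\mathrm{LoKr}} = \frac{\alpha}{r}\,(W_1\otimes W_2)

The optional mid tensors are Tucker cores, and LoKr's W_1 / W_2 may themselves be stored as the `_a`/`_b` low-rank factors looked up above.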
1, - bool bias = true) { + int kernel_size = 3, + std::tuple stride = {1, 1, 1}, + int dilation = 1, + bool bias = true) { time_kernel_size = kernel_size / 2; blocks["conv"] = std::shared_ptr(new Conv3d(in_channels, out_channels, diff --git a/mmdit.hpp b/mmdit.hpp index d9d19340..8442592a 100644 --- a/mmdit.hpp +++ b/mmdit.hpp @@ -1,6 +1,8 @@ #ifndef __MMDIT_HPP__ #define __MMDIT_HPP__ +#include + #include "ggml_extend.hpp" #include "model.h" @@ -208,8 +210,8 @@ class SelfAttention : public GGMLBlock { ggml_backend_t backend, struct ggml_tensor* x) { auto qkv = pre_attention(ctx, x); - x = ggml_nn_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], num_heads, NULL, false, false, true); // [N, n_token, dim] - x = post_attention(ctx, x); // [N, n_token, dim] + x = ggml_nn_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], num_heads, nullptr, false, false, true); // [N, n_token, dim] + x = post_attention(ctx, x); // [N, n_token, dim] return x; } }; @@ -347,7 +349,7 @@ struct DismantledBlock : public GGMLBlock { auto attn_in = modulate(ctx, norm1->forward(ctx, x), shift_msa, scale_msa); auto qkv = attn->pre_attention(ctx, attn_in); - return {qkv, {NULL, NULL, NULL, NULL, NULL}}; + return {qkv, {nullptr, nullptr, nullptr, nullptr, nullptr}}; } } @@ -439,8 +441,8 @@ struct DismantledBlock : public GGMLBlock { auto qkv2 = std::get<1>(qkv_intermediates); auto intermediates = std::get<2>(qkv_intermediates); - auto attn_out = ggml_nn_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], num_heads, NULL, false, false, flash_attn); // [N, n_token, dim] - auto attn2_out = ggml_nn_attention_ext(ctx, backend, qkv2[0], qkv2[1], qkv2[2], num_heads, NULL, false, false, flash_attn); // [N, n_token, dim] + auto attn_out = ggml_nn_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], num_heads, nullptr, false, false, flash_attn); // [N, n_token, dim] + auto attn2_out = ggml_nn_attention_ext(ctx, backend, qkv2[0], qkv2[1], qkv2[2], num_heads, nullptr, false, false, flash_attn); // [N, n_token, dim] x = post_attention_x(ctx, attn_out, attn2_out, @@ -456,7 +458,7 @@ struct DismantledBlock : public GGMLBlock { auto qkv = qkv_intermediates.first; auto intermediates = qkv_intermediates.second; - auto attn_out = ggml_nn_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], num_heads, NULL, false, false, flash_attn); // [N, n_token, dim] + auto attn_out = ggml_nn_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], num_heads, nullptr, false, false, flash_attn); // [N, n_token, dim] x = post_attention(ctx, attn_out, intermediates[0], @@ -502,8 +504,8 @@ block_mixing(struct ggml_context* ctx, qkv.push_back(ggml_concat(ctx, context_qkv[i], x_qkv[i], 1)); } - auto attn = ggml_nn_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], x_block->num_heads, NULL, false, false, flash_attn); // [N, n_context + n_token, hidden_size] - attn = ggml_cont(ctx, ggml_permute(ctx, attn, 0, 2, 1, 3)); // [n_context + n_token, N, hidden_size] + auto attn = ggml_nn_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], x_block->num_heads, nullptr, false, false, flash_attn); // [N, n_context + n_token, hidden_size] + attn = ggml_cont(ctx, ggml_permute(ctx, attn, 0, 2, 1, 3)); // [n_context + n_token, N, hidden_size] auto context_attn = ggml_view_3d(ctx, attn, attn->ne[0], @@ -532,7 +534,7 @@ block_mixing(struct ggml_context* ctx, context_intermediates[3], context_intermediates[4]); } else { - context = NULL; + context = nullptr; } if (x_block->self_attn) { @@ -645,7 +647,7 @@ struct MMDiT : public GGMLBlock { std::string qk_norm; bool flash_attn = 
false; - void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") { + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") override { enum ggml_type wtype = GGML_TYPE_F32; params["pos_embed"] = ggml_new_tensor_3d(ctx, wtype, hidden_size, num_patchs, 1); } @@ -823,8 +825,8 @@ struct MMDiT : public GGMLBlock { ggml_backend_t backend, struct ggml_tensor* x, struct ggml_tensor* t, - struct ggml_tensor* y = NULL, - struct ggml_tensor* context = NULL, + struct ggml_tensor* y = nullptr, + struct ggml_tensor* context = nullptr, std::vector skip_layers = std::vector()) { // Forward pass of DiT. // x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images) @@ -843,14 +845,14 @@ struct MMDiT : public GGMLBlock { x = ggml_add(ctx, patch_embed, pos_embed); // [N, H*W, hidden_size] auto c = t_embedder->forward(ctx, t); // [N, hidden_size] - if (y != NULL && adm_in_channels != -1) { + if (y != nullptr && adm_in_channels != -1) { auto y_embedder = std::dynamic_pointer_cast(blocks["y_embedder"]); y = y_embedder->forward(ctx, y); // [N, hidden_size] c = ggml_add(ctx, c, y); } - if (context != NULL) { + if (context != nullptr) { auto context_embedder = std::dynamic_pointer_cast(blocks["context_embedder"]); context = context_embedder->forward(ctx, context); // [N, L, D] aka [N, L, 1536] @@ -875,7 +877,7 @@ struct MMDiTRunner : public GGMLRunner { mmdit.init(params_ctx, tensor_types, prefix); } - std::string get_desc() { + std::string get_desc() override { return "mmdit"; } @@ -913,8 +915,8 @@ struct MMDiTRunner : public GGMLRunner { struct ggml_tensor* timesteps, struct ggml_tensor* context, struct ggml_tensor* y, - struct ggml_tensor** output = NULL, - struct ggml_context* output_ctx = NULL, + struct ggml_tensor** output = nullptr, + struct ggml_context* output_ctx = nullptr, std::vector skip_layers = std::vector()) { // x: [N, in_channels, h, w] // timesteps: [N, ] @@ -930,11 +932,11 @@ struct MMDiTRunner : public GGMLRunner { void test() { struct ggml_init_params params; params.mem_size = static_cast(10 * 1024 * 1024); // 10 MB - params.mem_buffer = NULL; + params.mem_buffer = nullptr; params.no_alloc = false; struct ggml_context* work_ctx = ggml_init(params); - GGML_ASSERT(work_ctx != NULL); + GGML_ASSERT(work_ctx != nullptr); { // cpu f16: pass @@ -955,7 +957,7 @@ struct MMDiTRunner : public GGMLRunner { ggml_set_f32(y, 0.01f); // print_ggml_tensor(y); - struct ggml_tensor* out = NULL; + struct ggml_tensor* out = nullptr; int t0 = ggml_time_ms(); compute(8, x, timesteps, context, y, &out, work_ctx); @@ -970,7 +972,7 @@ struct MMDiTRunner : public GGMLRunner { // ggml_backend_t backend = ggml_backend_cuda_init(0); ggml_backend_t backend = ggml_backend_cpu_init(); ggml_type model_data_type = GGML_TYPE_F16; - std::shared_ptr mmdit = std::shared_ptr(new MMDiTRunner(backend, false, false)); + std::shared_ptr mmdit = std::make_shared(backend, false, false); { LOG_INFO("loading from '%s'", file_path.c_str()); diff --git a/model.cpp b/model.cpp index b45493cc..b877915c 100644 --- a/model.cpp +++ b/model.cpp @@ -1,7 +1,7 @@ -#include #include #include #include +#include #include #include #include @@ -869,7 +869,6 @@ uint16_t f8_e5m2_to_f16(uint8_t fp8) { } if (exponent == 0) { // subnormal numbers - fp16_exponent = 0; fp16_mantissa = (mantissa << 8); return fp16_sign | fp16_mantissa; } @@ -948,7 +947,7 @@ void convert_tensor(void* src, ggml_fp16_to_fp32_row((ggml_fp16_t*)src, 
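On the f8_e5m2_to_f16 hunk in model.cpp: the deleted `fp16_exponent = 0;` was a dead store, since the subnormal branch only ORs the sign and mantissa into the result. More broadly, e5m2 shares fp16's 5-bit exponent and bias, so widening amounts to padding the mantissa; a self-contained sketch (my own illustration, not the repo's function):

#include <cstdint>

// fp8 e5m2:  S EEEEE MM          (bias 15)
// fp16:      S EEEEE MMMMMMMMMM  (bias 15)
// Same sign/exponent layout and bias, so widening just pads the mantissa with
// zeros, i.e. a left shift by 8; normals, subnormals, inf and NaN all map.
static inline uint16_t f8_e5m2_to_f16_sketch(uint8_t fp8) {
    return (uint16_t)((uint16_t)fp8 << 8);
}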
(float*)dst, n); } else { auto qtype = ggml_get_type_traits(src_type); - if (qtype->to_float == NULL) { + if (qtype->to_float == nullptr) { throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(src_type))); } @@ -958,7 +957,7 @@ void convert_tensor(void* src, // src_type == GGML_TYPE_F16 => dst_type is quantized // src_type is quantized => dst_type == GGML_TYPE_F16 or dst_type is quantized auto qtype = ggml_get_type_traits(src_type); - if (qtype->to_float == NULL) { + if (qtype->to_float == nullptr) { throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(src_type))); } @@ -1020,7 +1019,7 @@ std::map unicode_to_byte() { bool is_zip_file(const std::string& file_path) { struct zip_t* zip = zip_open(file_path.c_str(), 0, 'r'); - if (zip == NULL) { + if (zip == nullptr) { return false; } zip_close(zip); @@ -1116,8 +1115,8 @@ bool ModelLoader::init_from_gguf_file(const std::string& file_path, const std::s file_paths_.push_back(file_path); size_t file_index = file_paths_.size() - 1; - gguf_context* ctx_gguf_ = NULL; - ggml_context* ctx_meta_ = NULL; + gguf_context* ctx_gguf_ = nullptr; + ggml_context* ctx_meta_ = nullptr; ctx_gguf_ = gguf_init_from_file(file_path.c_str(), {true, &ctx_meta_}); if (!ctx_gguf_) { @@ -1726,7 +1725,7 @@ bool ModelLoader::init_from_ckpt_file(const std::string& file_path, const std::s size_t file_index = file_paths_.size() - 1; struct zip_t* zip = zip_open(file_path.c_str(), 0, 'r'); - if (zip == NULL) { + if (zip == nullptr) { LOG_ERROR("failed to open '%s'", file_path.c_str()); return false; } @@ -1739,7 +1738,7 @@ bool ModelLoader::init_from_ckpt_file(const std::string& file_path, const std::s if (pos != std::string::npos) { std::string dir = name.substr(0, pos); printf("ZIP %d, name = %s, dir = %s \n", i, name.c_str(), dir.c_str()); - void* pkl_data = NULL; + void* pkl_data = nullptr; size_t pkl_size; zip_entry_read(zip, &pkl_data, &pkl_size); @@ -1892,24 +1891,25 @@ SDVersion ModelLoader::get_sd_version() { return VERSION_COUNT; } -ggml_type ModelLoader::get_sd_wtype() { +std::map ModelLoader::get_wtype_stat() { + std::map wtype_stat; for (auto& tensor_storage : tensor_storages) { if (is_unused_tensor(tensor_storage.name)) { continue; } - if (ggml_is_quantized(tensor_storage.type)) { - return tensor_storage.type; - } - - if (tensor_should_be_converted(tensor_storage, GGML_TYPE_Q4_K)) { - return tensor_storage.type; + auto iter = wtype_stat.find(tensor_storage.type); + if (iter != wtype_stat.end()) { + iter->second++; + } else { + wtype_stat[tensor_storage.type] = 1; } } - return GGML_TYPE_COUNT; + return wtype_stat; } -ggml_type ModelLoader::get_conditioner_wtype() { +std::map ModelLoader::get_conditioner_wtype_stat() { + std::map wtype_stat; for (auto& tensor_storage : tensor_storages) { if (is_unused_tensor(tensor_storage.name)) { continue; @@ -1922,18 +1922,18 @@ ggml_type ModelLoader::get_conditioner_wtype() { continue; } - if (ggml_is_quantized(tensor_storage.type)) { - return tensor_storage.type; - } - - if (tensor_should_be_converted(tensor_storage, GGML_TYPE_Q4_K)) { - return tensor_storage.type; + auto iter = wtype_stat.find(tensor_storage.type); + if (iter != wtype_stat.end()) { + iter->second++; + } else { + wtype_stat[tensor_storage.type] = 1; } } - return GGML_TYPE_COUNT; + return wtype_stat; } -ggml_type ModelLoader::get_diffusion_model_wtype() { +std::map ModelLoader::get_diffusion_model_wtype_stat() { + std::map 
wtype_stat; for (auto& tensor_storage : tensor_storages) { if (is_unused_tensor(tensor_storage.name)) { continue; @@ -1943,18 +1943,18 @@ ggml_type ModelLoader::get_diffusion_model_wtype() { continue; } - if (ggml_is_quantized(tensor_storage.type)) { - return tensor_storage.type; - } - - if (tensor_should_be_converted(tensor_storage, GGML_TYPE_Q4_K)) { - return tensor_storage.type; + auto iter = wtype_stat.find(tensor_storage.type); + if (iter != wtype_stat.end()) { + iter->second++; + } else { + wtype_stat[tensor_storage.type] = 1; } } - return GGML_TYPE_COUNT; + return wtype_stat; } -ggml_type ModelLoader::get_vae_wtype() { +std::map ModelLoader::get_vae_wtype_stat() { + std::map wtype_stat; for (auto& tensor_storage : tensor_storages) { if (is_unused_tensor(tensor_storage.name)) { continue; @@ -1965,15 +1965,14 @@ ggml_type ModelLoader::get_vae_wtype() { continue; } - if (ggml_is_quantized(tensor_storage.type)) { - return tensor_storage.type; - } - - if (tensor_should_be_converted(tensor_storage, GGML_TYPE_Q4_K)) { - return tensor_storage.type; + auto iter = wtype_stat.find(tensor_storage.type); + if (iter != wtype_stat.end()) { + iter->second++; + } else { + wtype_stat[tensor_storage.type] = 1; } } - return GGML_TYPE_COUNT; + return wtype_stat; } void ModelLoader::set_wtype_override(ggml_type wtype, std::string prefix) { @@ -2144,10 +2143,10 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread for (int i = 0; i < n_threads; ++i) { workers.emplace_back([&, file_path, is_zip]() { std::ifstream file; - struct zip_t* zip = NULL; + struct zip_t* zip = nullptr; if (is_zip) { zip = zip_open(file_path.c_str(), 0, 'r'); - if (zip == NULL) { + if (zip == nullptr) { LOG_ERROR("failed to open zip '%s'", file_path.c_str()); failed = true; return; @@ -2172,7 +2171,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread } const TensorStorage& tensor_storage = *file_tensors[idx]; - ggml_tensor* dst_tensor = NULL; + ggml_tensor* dst_tensor = nullptr; t0 = ggml_time_ms(); @@ -2182,7 +2181,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread break; } - if (dst_tensor == NULL) { + if (dst_tensor == nullptr) { t1 = ggml_time_ms(); read_time_ms.fetch_add(t1 - t0); continue; @@ -2191,7 +2190,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread size_t nbytes_to_read = tensor_storage.nbytes_to_read(); auto read_data = [&](char* buf, size_t n) { - if (zip != NULL) { + if (zip != nullptr) { zip_entry_openbyindex(zip, tensor_storage.index_in_zip); size_t entry_size = zip_entry_size(zip); if (entry_size != n) { @@ -2215,7 +2214,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread } }; - if (dst_tensor->buffer == NULL || ggml_backend_buffer_is_host(dst_tensor->buffer)) { + if (dst_tensor->buffer == nullptr || ggml_backend_buffer_is_host(dst_tensor->buffer)) { if (tensor_storage.type == dst_tensor->type) { GGML_ASSERT(ggml_nbytes(dst_tensor) == tensor_storage.nbytes()); if (tensor_storage.is_f64 || tensor_storage.is_i64) { @@ -2317,7 +2316,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread } } } - if (zip != NULL) { + if (zip != nullptr) { zip_close(zip); } }); @@ -2507,7 +2506,7 @@ bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type mem_size += tensor_storages.size() * ggml_tensor_overhead(); mem_size += get_params_mem_size(backend, type); LOG_INFO("model tensors mem size: %.2fMB", mem_size / 
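The get_*_wtype() to get_*_wtype_stat() change replaces "first quantized type wins" with a full histogram. A caller that still wants one representative type can reduce the map, for instance by taking the most frequent entry; this sketch assumes the maps are keyed by ggml_type with a tensor count as the value, which is what the counting loops above imply:

#include <cstdint>
#include <map>
#include "ggml.h"

// Pick the most common weight type from a histogram shaped like the one
// returned by the get_*_wtype_stat() accessors (assumed std::map<ggml_type, uint32_t>).
static ggml_type dominant_wtype(const std::map<ggml_type, uint32_t>& stat,
                                ggml_type fallback = GGML_TYPE_F32) {
    ggml_type best = fallback;
    uint32_t count = 0;
    for (const auto& [type, n] : stat) {
        if (n > count) {
            best  = type;
            count = n;
        }
    }
    return best;
}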
1024.f / 1024.f); - ggml_context* ggml_ctx = ggml_init({mem_size, NULL, false}); + ggml_context* ggml_ctx = ggml_init({mem_size, nullptr, false}); gguf_context* gguf_ctx = gguf_init_empty(); @@ -2533,7 +2532,7 @@ bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type std::lock_guard lock(tensor_mutex); ggml_tensor* tensor = ggml_new_tensor(ggml_ctx, tensor_type, tensor_storage.n_dims, tensor_storage.ne); - if (tensor == NULL) { + if (tensor == nullptr) { LOG_ERROR("ggml_new_tensor failed"); return false; } @@ -2566,7 +2565,7 @@ bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type int64_t ModelLoader::get_params_mem_size(ggml_backend_t backend, ggml_type type) { size_t alignment = 128; - if (backend != NULL) { + if (backend != nullptr) { alignment = ggml_backend_get_alignment(backend); } int64_t mem_size = 0; @@ -2596,7 +2595,7 @@ bool convert(const char* input_path, const char* vae_path, const char* output_pa return false; } - if (vae_path != NULL && strlen(vae_path) > 0) { + if (vae_path != nullptr && strlen(vae_path) > 0) { if (!model_loader.init_from_file(vae_path, "vae.")) { LOG_ERROR("init model loader from file failed: '%s'", vae_path); return false; diff --git a/model.h b/model.h index 069bb0c2..fe77a219 100644 --- a/model.h +++ b/model.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include "ggml-backend.h" @@ -140,8 +141,8 @@ struct TensorStorage { TensorStorage() = default; - TensorStorage(const std::string& name, ggml_type type, const int64_t* ne, int n_dims, size_t file_index, size_t offset = 0) - : name(name), type(type), n_dims(n_dims), file_index(file_index), offset(offset) { + TensorStorage(std::string name, ggml_type type, const int64_t* ne, int n_dims, size_t file_index, size_t offset = 0) + : name(std::move(name)), type(type), n_dims(n_dims), file_index(file_index), offset(offset) { for (int i = 0; i < n_dims; i++) { this->ne[i] = ne[i]; } @@ -259,10 +260,10 @@ class ModelLoader { bool init_from_file(const std::string& file_path, const std::string& prefix = ""); bool model_is_unet(); SDVersion get_sd_version(); - ggml_type get_sd_wtype(); - ggml_type get_conditioner_wtype(); - ggml_type get_diffusion_model_wtype(); - ggml_type get_vae_wtype(); + std::map get_wtype_stat(); + std::map get_conditioner_wtype_stat(); + std::map get_diffusion_model_wtype_stat(); + std::map get_vae_wtype_stat(); void set_wtype_override(ggml_type wtype, std::string prefix = ""); bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads = 0); bool load_tensors(std::map& tensors, diff --git a/pmid.hpp b/pmid.hpp index 63029cbc..5ad7096a 100644 --- a/pmid.hpp +++ b/pmid.hpp @@ -472,8 +472,8 @@ struct PhotoMakerIDEncoder : public GGMLRunner { struct ggml_tensor* prompt_embeds_d = to_backend(prompt_embeds); struct ggml_tensor* id_embeds_d = to_backend(id_embeds); - struct ggml_tensor* left = NULL; - struct ggml_tensor* right = NULL; + struct ggml_tensor* left = nullptr; + struct ggml_tensor* right = nullptr; for (int i = 0; i < class_tokens_mask.size(); i++) { if (class_tokens_mask[i]) { // printf(" 1,"); @@ -528,7 +528,7 @@ struct PhotoMakerIDEncoder : public GGMLRunner { } } } - struct ggml_tensor* updated_prompt_embeds = NULL; + struct ggml_tensor* updated_prompt_embeds = nullptr; if (pm_version == PM_VERSION_1) updated_prompt_embeds = id_encoder.forward(ctx0, runtime_backend, @@ -638,7 +638,7 @@ struct PhotoMakerIDEmbed : public GGMLRunner { pos = tensors.find("pmid.id_embeds"); if (pos != tensors.end()) return 
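The TensorStorage constructor change is the modernize-pass-by-value pattern from the .clang-tidy check list: take the string by value and move it into the member, so temporaries are moved through and lvalue callers pay exactly one copy. Minimal illustration (the type and names here are mine):

#include <string>
#include <utility>

struct Named {
    std::string name;

    // By-value parameter + std::move: one copy for lvalue arguments,
    // moves only for rvalue/temporary arguments.
    explicit Named(std::string name) : name(std::move(name)) {}
};

// Usage:
//   std::string n = "model.diffusion_model.x.weight";
//   Named a(n);              // one copy into the parameter, then a move
//   Named b("vae.decoder");  // the temporary is moved all the way through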
pos->second; - return NULL; + return nullptr; } }; diff --git a/preprocessing.hpp b/preprocessing.hpp index 552aa642..11c3a21b 100644 --- a/preprocessing.hpp +++ b/preprocessing.hpp @@ -7,7 +7,7 @@ void convolve(struct ggml_tensor* input, struct ggml_tensor* output, struct ggml_tensor* kernel, int padding) { struct ggml_init_params params; params.mem_size = 80 * input->ne[0] * input->ne[1]; // 20M for 512x512 - params.mem_buffer = NULL; + params.mem_buffer = nullptr; params.no_alloc = false; struct ggml_context* ctx0 = ggml_init(params); struct ggml_tensor* kernel_fp16 = ggml_new_tensor_4d(ctx0, GGML_TYPE_F16, kernel->ne[0], kernel->ne[1], 1, 1); @@ -165,7 +165,7 @@ void threshold_hystersis(struct ggml_tensor* img, float high_threshold, float lo bool preprocess_canny(sd_image_t img, float high_threshold, float low_threshold, float weak, float strong, bool inverse) { struct ggml_init_params params; params.mem_size = static_cast(40 * img.width * img.height); // 10MB for 512x512 - params.mem_buffer = NULL; + params.mem_buffer = nullptr; params.no_alloc = false; struct ggml_context* work_ctx = ggml_init(params); diff --git a/qwen_image.hpp b/qwen_image.hpp index ce4e62dc..2d3cd230 100644 --- a/qwen_image.hpp +++ b/qwen_image.hpp @@ -1,6 +1,8 @@ #ifndef __QWEN_IMAGE_HPP__ #define __QWEN_IMAGE_HPP__ +#include + #include "common.hpp" #include "flux.hpp" #include "ggml_extend.hpp" @@ -534,12 +536,12 @@ namespace Qwen { continue; } } - LOG_ERROR("qwen_image_params.num_layers: %ld", qwen_image_params.num_layers); + LOG_INFO("qwen_image_params.num_layers: %ld", qwen_image_params.num_layers); qwen_image = QwenImageModel(qwen_image_params); qwen_image.init(params_ctx, tensor_types, prefix); } - std::string get_desc() { + std::string get_desc() override { return "qwen_image"; } @@ -577,7 +579,7 @@ namespace Qwen { auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, qwen_image_params.axes_dim_sum / 2, pos_len); // pe->data = pe_vec.data(); // print_ggml_tensor(pe, true, "pe"); - // pe->data = NULL; + // pe->data = nullptr; set_backend_tensor_data(pe, pe_vec.data()); struct ggml_tensor* out = qwen_image.forward(compute_ctx, @@ -599,8 +601,8 @@ namespace Qwen { struct ggml_tensor* context, std::vector ref_latents = {}, bool increase_ref_index = false, - struct ggml_tensor** output = NULL, - struct ggml_context* output_ctx = NULL) { + struct ggml_tensor** output = nullptr, + struct ggml_context* output_ctx = nullptr) { // x: [N, in_channels, h, w] // timesteps: [N, ] // context: [N, max_position, hidden_size] @@ -614,11 +616,11 @@ namespace Qwen { void test() { struct ggml_init_params params; params.mem_size = static_cast(1024 * 1024) * 1024; // 1GB - params.mem_buffer = NULL; + params.mem_buffer = nullptr; params.no_alloc = false; struct ggml_context* work_ctx = ggml_init(params); - GGML_ASSERT(work_ctx != NULL); + GGML_ASSERT(work_ctx != nullptr); { // auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 16, 16, 16, 1); @@ -634,7 +636,7 @@ namespace Qwen { auto context = load_tensor_from_file(work_ctx, "./qwen_image_context.bin"); print_ggml_tensor(context); - struct ggml_tensor* out = NULL; + struct ggml_tensor* out = nullptr; int t0 = ggml_time_ms(); compute(8, x, timesteps, context, {}, false, &out, work_ctx); @@ -666,12 +668,12 @@ namespace Qwen { } } - std::shared_ptr qwen_image = std::shared_ptr(new QwenImageRunner(backend, - false, - tensor_types, - "model.diffusion_model", - VERSION_QWEN_IMAGE, - true)); + std::shared_ptr qwen_image = std::make_shared(backend, + false, + tensor_types, + 
"model.diffusion_model", + VERSION_QWEN_IMAGE, + true); qwen_image->alloc_params_buffer(); std::map tensors; diff --git a/qwenvl.hpp b/qwenvl.hpp index 881f54d7..ab04435a 100644 --- a/qwenvl.hpp +++ b/qwenvl.hpp @@ -5,11 +5,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include "clip.hpp" @@ -589,7 +591,7 @@ namespace Qwen { int64_t window_size, std::set fullatt_block_indexes = {7, 15, 23, 31}, float eps = 1e-6f) - : num_layers(num_layers), fullatt_block_indexes(fullatt_block_indexes), spatial_merge_size(spatial_merge_size) { + : num_layers(num_layers), fullatt_block_indexes(std::move(fullatt_block_indexes)), spatial_merge_size(spatial_merge_size) { blocks["patch_embed"] = std::shared_ptr(new Qwen2_5_VisionPatchEmbed(llama_cpp_style, patch_size, temporal_patch_size, @@ -949,7 +951,7 @@ namespace Qwen { model.init(params_ctx, tensor_types, prefix); } - std::string get_desc() { + std::string get_desc() override { return "qwenvl2.5"; } @@ -1011,7 +1013,7 @@ namespace Qwen { struct ggml_tensor* input_ids, std::vector> image_embeds, ggml_tensor** output, - ggml_context* output_ctx = NULL) { + ggml_context* output_ctx = nullptr) { auto get_graph = [&]() -> struct ggml_cgraph* { return build_graph(input_ids, image_embeds); }; @@ -1162,7 +1164,7 @@ namespace Qwen { auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, head_dim / 2, pos_len); // pe->data = pe_vec.data(); // print_ggml_tensor(pe); - // pe->data = NULL; + // pe->data = nullptr; set_backend_tensor_data(pe, pe_vec.data()); struct ggml_tensor* hidden_states = vision_forward(compute_ctx, @@ -1180,7 +1182,7 @@ namespace Qwen { void encode_image(const int n_threads, struct ggml_tensor* image, ggml_tensor** output, - ggml_context* output_ctx = NULL) { + ggml_context* output_ctx = nullptr) { auto get_graph = [&]() -> struct ggml_cgraph* { return build_encode_image_graph(image); }; @@ -1246,11 +1248,11 @@ namespace Qwen { void test() { struct ggml_init_params params; params.mem_size = static_cast(1024 * 1024) * 1024; // 1GB - params.mem_buffer = NULL; + params.mem_buffer = nullptr; params.no_alloc = false; struct ggml_context* work_ctx = ggml_init(params); - GGML_ASSERT(work_ctx != NULL); + GGML_ASSERT(work_ctx != nullptr); bool test_vit = true; bool test_decoder_with_vit = true; @@ -1259,7 +1261,7 @@ namespace Qwen { { auto image = load_tensor_from_file(work_ctx, "qwen2vl_normalized.bin"); print_ggml_tensor(image, false, "image"); - struct ggml_tensor* out = NULL; + struct ggml_tensor* out = nullptr; int t0 = ggml_time_ms(); model.encode_image(8, image, &out, work_ctx); @@ -1295,7 +1297,7 @@ namespace Qwen { } printf("\n"); auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens); - struct ggml_tensor* out = NULL; + struct ggml_tensor* out = nullptr; int t0 = ggml_time_ms(); model.compute(8, input_ids, image_embeds, &out, work_ctx); @@ -1308,7 +1310,7 @@ namespace Qwen { // ggml_set_f32(image, 0.f); auto image = load_tensor_from_file(work_ctx, "qwen2vl_normalized.bin"); print_ggml_tensor(image, false, "image"); - struct ggml_tensor* out = NULL; + struct ggml_tensor* out = nullptr; int t0 = ggml_time_ms(); model.encode_image(8, image, &out, work_ctx); @@ -1330,7 +1332,7 @@ namespace Qwen { } printf("\n"); auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens); - struct ggml_tensor* out = NULL; + struct ggml_tensor* out = nullptr; int t0 = ggml_time_ms(); model.compute(8, input_ids, {}, &out, work_ctx); @@ -1361,11 +1363,11 @@ namespace Qwen { } } - std::shared_ptr 
qwenvl = std::shared_ptr(new Qwen2_5_VLEmbedder(backend, - false, - tensor_types, - "qwen2vl", - true)); + std::shared_ptr qwenvl = std::make_shared(backend, + false, + tensor_types, + "qwen2vl", + true); qwenvl->alloc_params_buffer(); std::map tensors; diff --git a/rng.hpp b/rng.hpp index 3340be61..accc4088 100644 --- a/rng.hpp +++ b/rng.hpp @@ -15,11 +15,11 @@ class STDDefaultRNG : public RNG { std::default_random_engine generator; public: - void manual_seed(uint64_t seed) { + void manual_seed(uint64_t seed) override { generator.seed((unsigned int)seed); } - std::vector randn(uint32_t n) { + std::vector randn(uint32_t n) override { std::vector result; float mean = 0.0; float stddev = 1.0; diff --git a/rng_philox.hpp b/rng_philox.hpp index 33fea9c5..58da0703 100644 --- a/rng_philox.hpp +++ b/rng_philox.hpp @@ -93,12 +93,12 @@ class PhiloxRNG : public RNG { this->offset = 0; } - void manual_seed(uint64_t seed) { + void manual_seed(uint64_t seed) override { this->seed = seed; this->offset = 0; } - std::vector randn(uint32_t n) { + std::vector randn(uint32_t n) override { std::vector> counter(4, std::vector(n, 0)); for (uint32_t i = 0; i < n; i++) { counter[0][i] = this->offset; diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 87b6a377..8fb88f48 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -82,14 +82,10 @@ void calculate_alphas_cumprod(float* alphas_cumprod, class StableDiffusionGGML { public: - ggml_backend_t backend = NULL; // general backend - ggml_backend_t clip_backend = NULL; - ggml_backend_t control_net_backend = NULL; - ggml_backend_t vae_backend = NULL; - ggml_type model_wtype = GGML_TYPE_COUNT; - ggml_type conditioner_wtype = GGML_TYPE_COUNT; - ggml_type diffusion_model_wtype = GGML_TYPE_COUNT; - ggml_type vae_wtype = GGML_TYPE_COUNT; + ggml_backend_t backend = nullptr; // general backend + ggml_backend_t clip_backend = nullptr; + ggml_backend_t control_net_backend = nullptr; + ggml_backend_t vae_backend = nullptr; SDVersion version; bool vae_decode_only = false; @@ -105,7 +101,7 @@ class StableDiffusionGGML { std::shared_ptr high_noise_diffusion_model; std::shared_ptr first_stage_model; std::shared_ptr tae_first_stage; - std::shared_ptr control_net = NULL; + std::shared_ptr control_net = nullptr; std::shared_ptr pmid_model; std::shared_ptr pmid_lora; std::shared_ptr pmid_id_embeds; @@ -294,37 +290,33 @@ class StableDiffusionGGML { ggml_type wtype = (int)sd_ctx_params->wtype < std::min(SD_TYPE_COUNT, GGML_TYPE_COUNT) ? 
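The rng.hpp / rng_philox.hpp hunks show what modernize-use-override buys: if a derived signature drifts from the base class, the compiler now rejects it instead of silently introducing a new, never-called virtual. Self-contained sketch of the failure mode (the RNG names are reused for flavor; the bodies are mine):

#include <cstdint>
#include <vector>

struct RNGSketch {
    virtual ~RNGSketch() = default;
    virtual void manual_seed(uint64_t seed) = 0;
    virtual std::vector<float> randn(uint32_t n) = 0;
};

struct DefaultRNGSketch : public RNGSketch {
    void manual_seed(uint64_t seed) override { (void)seed; }
    std::vector<float> randn(uint32_t n) override { return std::vector<float>(n, 0.f); }

    // Without `override`, a signature typo like the following would compile as a
    // brand-new overload and never be reached through a RNGSketch*:
    //   void manual_seed(uint32_t seed);
    // With `override`, the same mismatch is a hard compile error:
    //   void manual_seed(uint32_t seed) override;  // error: does not override
};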
(ggml_type)sd_ctx_params->wtype : GGML_TYPE_COUNT; - if (wtype == GGML_TYPE_COUNT) { - model_wtype = model_loader.get_sd_wtype(); - if (model_wtype == GGML_TYPE_COUNT) { - model_wtype = GGML_TYPE_F32; - LOG_WARN("can not get mode wtype frome weight, use f32"); - } - conditioner_wtype = model_loader.get_conditioner_wtype(); - if (conditioner_wtype == GGML_TYPE_COUNT) { - conditioner_wtype = wtype; - } - diffusion_model_wtype = model_loader.get_diffusion_model_wtype(); - if (diffusion_model_wtype == GGML_TYPE_COUNT) { - diffusion_model_wtype = wtype; - } - vae_wtype = model_loader.get_vae_wtype(); - - if (vae_wtype == GGML_TYPE_COUNT) { - vae_wtype = wtype; - } - } else { - model_wtype = wtype; - conditioner_wtype = wtype; - diffusion_model_wtype = wtype; - vae_wtype = wtype; + if (wtype != GGML_TYPE_COUNT) { model_loader.set_wtype_override(wtype); } - LOG_INFO("Weight type: %s", ggml_type_name(model_wtype)); - LOG_INFO("Conditioner weight type: %s", ggml_type_name(conditioner_wtype)); - LOG_INFO("Diffusion model weight type: %s", ggml_type_name(diffusion_model_wtype)); - LOG_INFO("VAE weight type: %s", ggml_type_name(vae_wtype)); + std::map wtype_stat = model_loader.get_wtype_stat(); + std::map conditioner_wtype_stat = model_loader.get_conditioner_wtype_stat(); + std::map diffusion_model_wtype_stat = model_loader.get_diffusion_model_wtype_stat(); + std::map vae_wtype_stat = model_loader.get_vae_wtype_stat(); + + auto wtype_stat_to_str = [](const std::map& m, int key_width = 8, int value_width = 5) -> std::string { + std::ostringstream oss; + bool first = true; + for (const auto& [type, count] : m) { + if (!first) + oss << "|"; + first = false; + oss << std::right << std::setw(key_width) << ggml_type_name(type) + << ": " + << std::left << std::setw(value_width) << count; + } + return oss.str(); + }; + + LOG_INFO("Weight type stat: %s", wtype_stat_to_str(wtype_stat).c_str()); + LOG_INFO("Conditioner weight type stat: %s", wtype_stat_to_str(conditioner_wtype_stat).c_str()); + LOG_INFO("Diffusion model weight type stat: %s", wtype_stat_to_str(diffusion_model_wtype_stat).c_str()); + LOG_INFO("VAE weight type stat: %s", wtype_stat_to_str(vae_wtype_stat).c_str()); LOG_DEBUG("ggml tensor size = %d bytes", (int)sizeof(ggml_tensor)); @@ -536,7 +528,7 @@ class StableDiffusionGGML { // first_stage_model->get_param_tensors(tensors, "first_stage_model."); if (strlen(SAFE_STR(sd_ctx_params->control_net_path)) > 0) { - ggml_backend_t controlnet_backend = NULL; + ggml_backend_t controlnet_backend = nullptr; if (sd_ctx_params->keep_control_net_on_cpu && !ggml_backend_is_cpu(backend)) { LOG_DEBUG("ControlNet: Using CPU backend"); controlnet_backend = ggml_backend_cpu_init(); @@ -592,11 +584,11 @@ class StableDiffusionGGML { struct ggml_init_params params; params.mem_size = static_cast(10 * 1024) * 1024; // 10M - params.mem_buffer = NULL; + params.mem_buffer = nullptr; params.no_alloc = false; // LOG_DEBUG("mem_size %u ", params.mem_size); struct ggml_context* ctx = ggml_init(params); // for alphas_cumprod and is_using_v_parameterization check - GGML_ASSERT(ctx != NULL); + GGML_ASSERT(ctx != nullptr); ggml_tensor* alphas_cumprod_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, TIMESTEPS); calculate_alphas_cumprod((float*)alphas_cumprod_tensor->data); @@ -869,8 +861,8 @@ class StableDiffusionGGML { struct ggml_tensor* timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 1); ggml_set_f32(timesteps, 999); - struct ggml_tensor* concat = is_inpaint ? 
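Usage note on the wtype_stat_to_str lambda: with the default widths (key_width = 8, value_width = 5) each entry is rendered as a right-aligned type name, a colon, and a left-aligned count, joined by '|', so a mixed-precision checkpoint logs a line roughly like the following (illustrative values and spacing):

Diffusion model weight type stat:      f32: 130  |     f16: 24   |    q4_K: 694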
ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 5, 1) : NULL; - if (concat != NULL) { + struct ggml_tensor* concat = is_inpaint ? ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 5, 1) : nullptr; + if (concat != nullptr) { ggml_set_f32(concat, 0); } @@ -938,9 +930,6 @@ class StableDiffusionGGML { } void apply_loras(const std::unordered_map& lora_state) { - if (lora_state.size() > 0 && model_wtype != GGML_TYPE_F16 && model_wtype != GGML_TYPE_F32) { - LOG_WARN("In quantized models when applying LoRA, the images have poor quality."); - } std::unordered_map lora_state_diff; for (auto& kv : lora_state) { const std::string& lora_name = kv.first; @@ -987,7 +976,7 @@ class StableDiffusionGGML { ggml_tensor* prompts_embeds, ggml_tensor* id_embeds, std::vector& class_tokens_mask) { - ggml_tensor* res = NULL; + ggml_tensor* res = nullptr; pmid_model->compute(n_threads, init_img, prompts_embeds, id_embeds, class_tokens_mask, &res, work_ctx); return res; } @@ -997,7 +986,7 @@ class StableDiffusionGGML { bool return_pooled = true, int clip_skip = -1, bool zero_out_masked = false) { - ggml_tensor* output = NULL; + ggml_tensor* output = nullptr; if (zero_out_masked) { if (return_pooled) { output = ggml_new_tensor_1d(work_ctx, @@ -1015,12 +1004,12 @@ class StableDiffusionGGML { sd_image_f32_t image = sd_image_t_to_sd_image_f32_t(init_image); sd_image_f32_t resized_image = clip_preprocess(image, clip_vision->vision_model.image_size, clip_vision->vision_model.image_size); free(image.data); - image.data = NULL; + image.data = nullptr; ggml_tensor* pixel_values = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, resized_image.width, resized_image.height, 3, 1); sd_image_f32_to_tensor(resized_image, pixel_values, false); free(resized_image.data); - resized_image.data = NULL; + resized_image.data = nullptr; // print_ggml_tensor(pixel_values); clip_vision->compute(n_threads, pixel_values, return_pooled, clip_skip, &output, work_ctx); @@ -1042,7 +1031,7 @@ class StableDiffusionGGML { struct ggml_tensor* c_crossattn = get_clip_vision_output(work_ctx, init_image, true, -1, zero_out_masked); // c_concat - struct ggml_tensor* c_concat = NULL; + struct ggml_tensor* c_concat = nullptr; { if (zero_out_masked) { c_concat = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / 8, height / 8, 4, 1); @@ -1054,10 +1043,10 @@ class StableDiffusionGGML { sd_image_f32_t image = sd_image_t_to_sd_image_f32_t(init_image); sd_image_f32_t resized_image = resize_sd_image_f32_t(image, width, height); free(image.data); - image.data = NULL; + image.data = nullptr; sd_image_f32_to_tensor(resized_image, init_img, false); free(resized_image.data); - resized_image.data = NULL; + resized_image.data = nullptr; } else { sd_image_to_tensor(init_image, init_img); } @@ -1074,7 +1063,7 @@ class StableDiffusionGGML { } // y - struct ggml_tensor* y = NULL; + struct ggml_tensor* y = nullptr; { y = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, diffusion_model->get_adm_in_channels()); int out_dim = 256; @@ -1093,7 +1082,7 @@ class StableDiffusionGGML { if (diffusion_model->get_desc() == "Wan2.2-TI2V-5B") { auto new_timesteps = std::vector(init_latent->ne[2], timesteps[0]); - if (denoise_mask != NULL) { + if (denoise_mask != nullptr) { float value = ggml_tensor_get_f32(denoise_mask, 0, 0, 0, 0); if (value == 0.f) { new_timesteps[0] = 0.f; @@ -1140,8 +1129,8 @@ class StableDiffusionGGML { SDCondition id_cond, std::vector ref_latents = {}, bool increase_ref_index = false, - ggml_tensor* denoise_mask = NULL, - ggml_tensor* vace_context = NULL, + ggml_tensor* 
denoise_mask = nullptr, + ggml_tensor* vace_context = nullptr, float vace_strength = 1.f) { if (shifted_timestep > 0 && !sd_version_is_sdxl(version)) { LOG_WARN("timestep shifting is only supported for SDXL models!"); @@ -1168,15 +1157,15 @@ class StableDiffusionGGML { struct ggml_tensor* noised_input = ggml_dup_tensor(work_ctx, x); - bool has_unconditioned = img_cfg_scale != 1.0 && uncond.c_crossattn != NULL; - bool has_img_cond = cfg_scale != img_cfg_scale && img_cond.c_crossattn != NULL; + bool has_unconditioned = img_cfg_scale != 1.0 && uncond.c_crossattn != nullptr; + bool has_img_cond = cfg_scale != img_cfg_scale && img_cond.c_crossattn != nullptr; bool has_skiplayer = slg_scale != 0.0 && skip_layers.size() > 0; // denoise wrapper struct ggml_tensor* out_cond = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor* out_uncond = NULL; - struct ggml_tensor* out_skip = NULL; - struct ggml_tensor* out_img_cond = NULL; + struct ggml_tensor* out_uncond = nullptr; + struct ggml_tensor* out_skip = nullptr; + struct ggml_tensor* out_img_cond = nullptr; if (has_unconditioned) { out_uncond = ggml_dup_tensor(work_ctx, x); @@ -1234,7 +1223,7 @@ class StableDiffusionGGML { std::vector controls; - if (control_hint != NULL && control_net != NULL) { + if (control_hint != nullptr && control_net != nullptr) { control_net->compute(n_threads, noised_input, control_hint, timesteps, cond.c_crossattn, cond.c_vector); controls = control_net->controls; // print_ggml_tensor(controls[12]); @@ -1269,10 +1258,10 @@ class StableDiffusionGGML { &out_cond); } - float* negative_data = NULL; + float* negative_data = nullptr; if (has_unconditioned) { // uncond - if (control_hint != NULL && control_net != NULL) { + if (control_hint != nullptr && control_net != nullptr) { control_net->compute(n_threads, noised_input, control_hint, timesteps, uncond.c_crossattn, uncond.c_vector); controls = control_net->controls; } @@ -1286,7 +1275,7 @@ class StableDiffusionGGML { negative_data = (float*)out_uncond->data; } - float* img_cond_data = NULL; + float* img_cond_data = nullptr; if (has_img_cond) { diffusion_params.context = img_cond.c_crossattn; diffusion_params.c_concat = img_cond.c_concat; @@ -1299,7 +1288,7 @@ class StableDiffusionGGML { int step_count = sigmas.size(); bool is_skiplayer_step = has_skiplayer && step > (int)(guidance.slg.layer_start * step_count) && step < (int)(guidance.slg.layer_end * step_count); - float* skip_layer_data = NULL; + float* skip_layer_data = nullptr; if (is_skiplayer_step) { LOG_DEBUG("Skipping layers at step %d\n", step); // skip layer (same as conditionned) @@ -1490,7 +1479,7 @@ class StableDiffusionGGML { ggml_tensor* vae_encode(ggml_context* work_ctx, ggml_tensor* x, bool encode_video = false) { int64_t t0 = ggml_time_ms(); - ggml_tensor* result = NULL; + ggml_tensor* result = nullptr; int W = x->ne[0] / 8; int H = x->ne[1] / 8; if (vae_tiling_params.enabled && !encode_video) { @@ -1537,7 +1526,7 @@ class StableDiffusionGGML { if (vae_tiling_params.enabled && !encode_video) { // split latent in 32x32 tiles and compute in several steps auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { - tae_first_stage->compute(n_threads, in, false, &out, NULL); + tae_first_stage->compute(n_threads, in, false, &out, nullptr); }; sd_tiling(x, result, 8, 64, 0.5f, on_tiling); } else { @@ -1612,7 +1601,7 @@ class StableDiffusionGGML { int64_t W = x->ne[0] * 8; int64_t H = x->ne[1] * 8; int64_t C = 3; - ggml_tensor* result = NULL; + ggml_tensor* result = nullptr; if (decode_video) { int T = 
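The out_cond / out_uncond / out_img_cond / out_skip buffers prepared above feed the usual classifier-free-guidance blend. As a reference point (standard CFG; the exact weighting of the image-conditioned and skip-layer terms is whatever this sampler implements):

\hat{x} = x_{\mathrm{uncond}} + s_{\mathrm{cfg}}\,(x_{\mathrm{cond}} - x_{\mathrm{uncond}})

When img_cfg_scale differs from cfg_scale, a separate image-conditioned pass supplies an intermediate difference term, and on SLG steps a correction proportional to (x_cond − x_skip) is added with weight slg_scale.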
x->ne[2]; if (sd_version_is_wan(version)) { @@ -1652,7 +1641,7 @@ class StableDiffusionGGML { // split latent in 32x32 tiles and compute in several steps auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { - first_stage_model->compute(n_threads, in, true, &out, NULL); + first_stage_model->compute(n_threads, in, true, &out, nullptr); }; sd_tiling_non_square(x, result, 8, tile_size_x, tile_size_y, tile_overlap, on_tiling); } else { @@ -1829,7 +1818,7 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) { char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { char* buf = (char*)malloc(4096); if (!buf) - return NULL; + return nullptr; buf[0] = '\0'; snprintf(buf + strlen(buf), 4096 - strlen(buf), @@ -1849,7 +1838,6 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { "embedding_dir: %s\n" "photo_maker_path: %s\n" "vae_decode_only: %s\n" - "vae_tiling: %s\n" "free_params_immediately: %s\n" "n_threads: %d\n" "wtype: %s\n" @@ -1913,7 +1901,7 @@ void sd_sample_params_init(sd_sample_params_t* sample_params) { char* sd_sample_params_to_str(const sd_sample_params_t* sample_params) { char* buf = (char*)malloc(4096); if (!buf) - return NULL; + return nullptr; buf[0] = '\0'; snprintf(buf + strlen(buf), 4096 - strlen(buf), @@ -1965,7 +1953,7 @@ void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params) { char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) { char* buf = (char*)malloc(4096); if (!buf) - return NULL; + return nullptr; buf[0] = '\0'; char* sample_params_str = sd_sample_params_to_str(&sd_img_gen_params->sample_params); @@ -1981,6 +1969,7 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) { "seed: %" PRId64 "batch_count: %d\n" "ref_images_count: %d\n" + "auto_resize_ref_image: %s\n" "increase_ref_index: %s\n" "control_strength: %.2f\n" "photo maker: {style_strength = %.2f, id_images_count = %d, id_embed_path = %s}\n" @@ -1995,6 +1984,7 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) { sd_img_gen_params->seed, sd_img_gen_params->batch_count, sd_img_gen_params->ref_images_count, + BOOL_STR(sd_img_gen_params->auto_resize_ref_image), BOOL_STR(sd_img_gen_params->increase_ref_index), sd_img_gen_params->control_strength, sd_img_gen_params->pm_params.style_strength, @@ -2020,40 +2010,40 @@ void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params) { } struct sd_ctx_t { - StableDiffusionGGML* sd = NULL; + StableDiffusionGGML* sd = nullptr; }; sd_ctx_t* new_sd_ctx(const sd_ctx_params_t* sd_ctx_params) { sd_ctx_t* sd_ctx = (sd_ctx_t*)malloc(sizeof(sd_ctx_t)); - if (sd_ctx == NULL) { - return NULL; + if (sd_ctx == nullptr) { + return nullptr; } sd_ctx->sd = new StableDiffusionGGML(); - if (sd_ctx->sd == NULL) { + if (sd_ctx->sd == nullptr) { free(sd_ctx); - return NULL; + return nullptr; } if (!sd_ctx->sd->init(sd_ctx_params)) { delete sd_ctx->sd; - sd_ctx->sd = NULL; + sd_ctx->sd = nullptr; free(sd_ctx); - return NULL; + return nullptr; } return sd_ctx; } void free_sd_ctx(sd_ctx_t* sd_ctx) { - if (sd_ctx->sd != NULL) { + if (sd_ctx->sd != nullptr) { delete sd_ctx->sd; - sd_ctx->sd = NULL; + sd_ctx->sd = nullptr; } free(sd_ctx); } enum sample_method_t sd_get_default_sample_method(const sd_ctx_t* sd_ctx) { - if (sd_ctx != NULL && sd_ctx->sd != NULL) { + if (sd_ctx != nullptr && sd_ctx->sd != nullptr) { SDVersion version = sd_ctx->sd->version; if (sd_version_is_dit(version)) return EULER; @@ -2084,13 +2074,13 @@ sd_image_t* 
generate_image_internal(sd_ctx_t* sd_ctx, std::vector ref_images, std::vector ref_latents, bool increase_ref_index, - ggml_tensor* concat_latent = NULL, - ggml_tensor* denoise_mask = NULL) { + ggml_tensor* concat_latent = nullptr, + ggml_tensor* denoise_mask = nullptr) { if (seed < 0) { // Generally, when using the provided command line, the seed is always >0. // However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library // by a third party with a seed <0, let's incorporate randomization here. - srand((int)time(NULL)); + srand((int)time(nullptr)); seed = rand(); } @@ -2111,7 +2101,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, // Photo Maker std::string prompt_text_only; - ggml_tensor* init_img = NULL; + ggml_tensor* init_img = nullptr; SDCondition id_cond; std::vector class_tokens_mask; @@ -2146,7 +2136,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, sd_image_f32_t id_image = sd_image_t_to_sd_image_f32_t(pm_params.id_images[i]); sd_image_f32_t processed_id_image = clip_preprocess(id_image, clip_image_size, clip_image_size); free(id_image.data); - id_image.data = NULL; + id_image.data = nullptr; processed_id_images.push_back(processed_id_image); } @@ -2157,7 +2147,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, for (auto& image : processed_id_images) { free(image.data); - image.data = NULL; + image.data = nullptr; } processed_id_images.clear(); @@ -2169,7 +2159,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, condition_params); id_cond = std::get<0>(cond_tup); class_tokens_mask = std::get<1>(cond_tup); // - struct ggml_tensor* id_embeds = NULL; + struct ggml_tensor* id_embeds = nullptr; if (pmv2 && pm_params.id_embed_path != nullptr) { id_embeds = load_tensor_from_file(work_ctx, pm_params.id_embed_path); // print_ggml_tensor(id_embeds, true, "id_embeds:"); @@ -2195,7 +2185,6 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, } // Get learned condition - t0 = ggml_time_ms(); condition_params.text = prompt; condition_params.zero_out_masked = false; SDCondition cond = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx, @@ -2223,8 +2212,8 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, } // Control net hint - struct ggml_tensor* image_hint = NULL; - if (control_image.data != NULL) { + struct ggml_tensor* image_hint = nullptr; + if (control_image.data != nullptr) { image_hint = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); sd_image_to_tensor(control_image, image_hint); } @@ -2243,8 +2232,8 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, int H = height / 8; LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]); - struct ggml_tensor* control_latent = NULL; - if (sd_version_is_control(sd_ctx->sd->version) && image_hint != NULL) { + struct ggml_tensor* control_latent = nullptr; + if (sd_version_is_control(sd_ctx->sd->version) && image_hint != nullptr) { control_latent = sd_ctx->sd->encode_first_stage(work_ctx, image_hint); ggml_tensor_scale(control_latent, control_strength); } @@ -2282,8 +2271,8 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, } } - if (sd_ctx->sd->version == VERSION_FLEX_2 && control_latent != NULL && sd_ctx->sd->control_net == NULL) { - bool no_inpaint = concat_latent == NULL; + if (sd_ctx->sd->version == VERSION_FLEX_2 && control_latent != nullptr && sd_ctx->sd->control_net == nullptr) { + bool no_inpaint = concat_latent == nullptr; if (no_inpaint) { concat_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 
init_latent->ne[0], init_latent->ne[1], mask_channels + init_latent->ne[2], 1); } @@ -2302,33 +2291,33 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, } } } - } else if (concat_latent == NULL) { + } else if (concat_latent == nullptr) { concat_latent = empty_latent; } cond.c_concat = concat_latent; uncond.c_concat = empty_latent; - denoise_mask = NULL; + denoise_mask = nullptr; } else if (sd_version_is_unet_edit(sd_ctx->sd->version)) { auto empty_latent = ggml_dup_tensor(work_ctx, init_latent); ggml_set_f32(empty_latent, 0); uncond.c_concat = empty_latent; cond.c_concat = ref_latents[0]; - if (cond.c_concat == NULL) { + if (cond.c_concat == nullptr) { cond.c_concat = empty_latent; } } else if (sd_version_is_control(sd_ctx->sd->version)) { auto empty_latent = ggml_dup_tensor(work_ctx, init_latent); ggml_set_f32(empty_latent, 0); uncond.c_concat = empty_latent; - if (sd_ctx->sd->control_net == NULL) { + if (sd_ctx->sd->control_net == nullptr) { cond.c_concat = control_latent; } - if (cond.c_concat == NULL) { + if (cond.c_concat == nullptr) { cond.c_concat = empty_latent; } } SDCondition img_cond; - if (uncond.c_crossattn != NULL && + if (uncond.c_crossattn != nullptr && (sd_version_is_inpaint_or_unet_edit(sd_ctx->sd->version) && guidance.txt_cfg != guidance.img_cfg)) { img_cond = SDCondition(uncond.c_crossattn, uncond.c_vector, cond.c_concat); } @@ -2389,7 +2378,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, t1 = ggml_time_ms(); struct ggml_tensor* img = sd_ctx->sd->decode_first_stage(work_ctx, final_latents[i] /* x_0 */); // print_ggml_tensor(img); - if (img != NULL) { + if (img != nullptr) { decoded_images.push_back(img); } int64_t t2 = ggml_time_ms(); @@ -2402,9 +2391,9 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, sd_ctx->sd->first_stage_model->free_params_buffer(); } sd_image_t* result_images = (sd_image_t*)calloc(batch_count, sizeof(sd_image_t)); - if (result_images == NULL) { + if (result_images == nullptr) { ggml_free(work_ctx); - return NULL; + return nullptr; } for (size_t i = 0; i < decoded_images.size(); i++) { @@ -2469,35 +2458,35 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g model_version_to_str[sd_ctx->sd->version], width, height); - return NULL; + return nullptr; } } else if (width % 64 || height % 64) { LOG_ERROR("Image dimensions must be must be a multiple of 64 on each axis for %s models. 
(Got %dx%d)", model_version_to_str[sd_ctx->sd->version], width, height); - return NULL; + return nullptr; } LOG_DEBUG("generate_image %dx%d", width, height); - if (sd_ctx == NULL || sd_img_gen_params == NULL) { - return NULL; + if (sd_ctx == nullptr || sd_img_gen_params == nullptr) { + return nullptr; } struct ggml_init_params params; params.mem_size = static_cast(1024 * 1024) * 1024; // 1G - params.mem_buffer = NULL; + params.mem_buffer = nullptr; params.no_alloc = false; // LOG_DEBUG("mem_size %u ", params.mem_size); struct ggml_context* work_ctx = ggml_init(params); if (!work_ctx) { LOG_ERROR("ggml_init() failed"); - return NULL; + return nullptr; } int64_t seed = sd_img_gen_params->seed; if (seed < 0) { - srand((int)time(NULL)); + srand((int)time(nullptr)); seed = rand(); } sd_ctx->sd->rng->manual_seed(seed); @@ -2509,9 +2498,9 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g sd_ctx->sd->init_scheduler(sd_img_gen_params->sample_params.scheduler); std::vector sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps); - ggml_tensor* init_latent = NULL; - ggml_tensor* concat_latent = NULL; - ggml_tensor* denoise_mask = NULL; + ggml_tensor* init_latent = nullptr; + ggml_tensor* concat_latent = nullptr; + ggml_tensor* denoise_mask = nullptr; if (sd_img_gen_params->init_image.data) { LOG_INFO("IMG2IMG"); @@ -2538,7 +2527,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g } else if (sd_ctx->sd->version == VERSION_FLEX_2) { mask_channels = 1 + init_latent->ne[2]; } - ggml_tensor* masked_latent = NULL; + ggml_tensor* masked_latent = nullptr; if (sd_ctx->sd->version != VERSION_FLEX_2) { // most inpaint models mask before vae @@ -2635,14 +2624,20 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g std::vector ref_latents; for (int i = 0; i < ref_images.size(); i++) { ggml_tensor* img; - if (sd_version_is_qwen_image(sd_ctx->sd->version)) { + if (sd_img_gen_params->auto_resize_ref_image) { + LOG_DEBUG("auto resize ref images"); sd_image_f32_t ref_image = sd_image_t_to_sd_image_f32_t(*ref_images[i]); int VAE_IMAGE_SIZE = std::min(1024 * 1024, width * height); double vae_width = sqrt(VAE_IMAGE_SIZE * ref_image.width / ref_image.height); double vae_height = vae_width * ref_image.height / ref_image.width; - vae_height = round(vae_height / 32) * 32; - vae_width = round(vae_width / 32) * 32; + int factor = 16; + if (sd_version_is_qwen_image(sd_ctx->sd->version)) { + factor = 32; + } + + vae_height = round(vae_height / factor) * factor; + vae_width = round(vae_width / factor) * factor; sd_image_f32_t resized_image = resize_sd_image_f32_t(ref_image, static_cast(vae_width), static_cast(vae_height)); free(ref_image.data); @@ -2675,7 +2670,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g ref_latents.push_back(latent); } - if (sd_img_gen_params->init_image.data != NULL || sd_img_gen_params->ref_images_count > 0) { + if (sd_img_gen_params->init_image.data != nullptr || sd_img_gen_params->ref_images_count > 0) { size_t t1 = ggml_time_ms(); LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); } @@ -2717,8 +2712,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g } SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params, int* num_frames_out) { - if (sd_ctx == NULL || sd_vid_gen_params == NULL) { - return NULL; + if (sd_ctx == nullptr || sd_vid_gen_params == nullptr) { + return 
nullptr; } std::string prompt = SAFE_STR(sd_vid_gen_params->prompt); @@ -2755,24 +2750,23 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s } } LOG_DEBUG("switching from high noise model at step %d", high_noise_sample_steps); - sample_steps = total_steps - high_noise_sample_steps; } struct ggml_init_params params; params.mem_size = static_cast(1024 * 1024) * 1024; // 1G - params.mem_buffer = NULL; + params.mem_buffer = nullptr; params.no_alloc = false; // LOG_DEBUG("mem_size %u ", params.mem_size); struct ggml_context* work_ctx = ggml_init(params); if (!work_ctx) { LOG_ERROR("ggml_init() failed"); - return NULL; + return nullptr; } int64_t seed = sd_vid_gen_params->seed; if (seed < 0) { - seed = (int)time(NULL); + seed = (int)time(nullptr); } sd_ctx->sd->rng->manual_seed(seed); @@ -2782,11 +2776,11 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s // Apply lora prompt = sd_ctx->sd->apply_loras_from_prompt(prompt); - ggml_tensor* init_latent = NULL; - ggml_tensor* clip_vision_output = NULL; - ggml_tensor* concat_latent = NULL; - ggml_tensor* denoise_mask = NULL; - ggml_tensor* vace_context = NULL; + ggml_tensor* init_latent = nullptr; + ggml_tensor* clip_vision_output = nullptr; + ggml_tensor* concat_latent = nullptr; + ggml_tensor* denoise_mask = nullptr; + ggml_tensor* vace_context = nullptr; int64_t ref_image_num = 0; // for vace if (sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-I2V-14B" || sd_ctx->sd->diffusion_model->get_desc() == "Wan2.2-I2V-14B" || @@ -2802,7 +2796,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s } if (sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-FLF2V-14B") { - ggml_tensor* end_image_clip_vision_output = NULL; + ggml_tensor* end_image_clip_vision_output = nullptr; if (sd_vid_gen_params->end_image.data) { end_image_clip_vision_output = sd_ctx->sd->get_clip_vision_output(work_ctx, sd_vid_gen_params->end_image, false, -2); } else { @@ -2883,7 +2877,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s sd_ctx->sd->diffusion_model->get_desc() == "Wan2.x-VACE-14B") { LOG_INFO("VACE"); int64_t t1 = ggml_time_ms(); - ggml_tensor* ref_image_latent = NULL; + ggml_tensor* ref_image_latent = nullptr; if (sd_vid_gen_params->init_image.data) { ggml_tensor* ref_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); sd_image_to_tensor(sd_vid_gen_params->init_image, ref_img); @@ -2956,7 +2950,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s LOG_INFO("encode_first_stage completed, taking %" PRId64 " ms", t2 - t1); } - if (init_latent == NULL) { + if (init_latent == nullptr) { init_latent = generate_init_latent(sd_ctx, work_ctx, width, height, frames, true); } @@ -3019,7 +3013,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s cond, uncond, {}, - NULL, + nullptr, 0, sd_vid_gen_params->high_noise_sample_params.guidance, sd_vid_gen_params->high_noise_sample_params.eta, @@ -3039,7 +3033,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s if (sd_ctx->sd->free_params_immediately) { sd_ctx->sd->high_noise_diffusion_model->free_params_buffer(); } - noise = NULL; + noise = nullptr; } // Sample @@ -3055,7 +3049,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s cond, uncond, {}, - NULL, + nullptr, 0, sd_vid_gen_params->sample_params.guidance, sd_vid_gen_params->sample_params.eta, @@ -3101,9 
+3095,9 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s } sd_image_t* result_images = (sd_image_t*)calloc(vid->ne[2], sizeof(sd_image_t)); - if (result_images == NULL) { + if (result_images == nullptr) { ggml_free(work_ctx); - return NULL; + return nullptr; } *num_frames_out = vid->ne[2]; diff --git a/stable-diffusion.h b/stable-diffusion.h index a891a58f..59a25cdc 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -216,6 +216,7 @@ typedef struct { sd_image_t init_image; sd_image_t* ref_images; int ref_images_count; + bool auto_resize_ref_image; bool increase_ref_index; sd_image_t mask_image; int width; @@ -292,7 +293,8 @@ typedef struct upscaler_ctx_t upscaler_ctx_t; SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path, bool offload_params_to_cpu, bool direct, - int n_threads); + int n_threads, + int tile_size); SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx); SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, diff --git a/t5.hpp b/t5.hpp index 15f7af80..1067a050 100644 --- a/t5.hpp +++ b/t5.hpp @@ -1,7 +1,7 @@ #ifndef __T5_HPP__ #define __T5_HPP__ -#include +#include #include #include #include @@ -461,7 +461,7 @@ class T5LayerNorm : public UnaryBlock { int64_t hidden_size; float eps; - void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") { + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") override { enum ggml_type wtype = GGML_TYPE_F32; params["weight"] = ggml_new_tensor_1d(ctx, wtype, hidden_size); } @@ -472,7 +472,7 @@ class T5LayerNorm : public UnaryBlock { : hidden_size(hidden_size), eps(eps) {} - struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { + struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override { struct ggml_tensor* w = params["weight"]; x = ggml_rms_norm(ctx, x, eps); x = ggml_mul(ctx, x, w); @@ -487,7 +487,7 @@ struct T5DenseActDense : public UnaryBlock { blocks["wo"] = std::shared_ptr(new Linear(ff_dim, model_dim, false)); } - struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { + struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override { // x: [N, n_token, model_dim] auto wi = std::dynamic_pointer_cast(blocks["wi"]); auto wo = std::dynamic_pointer_cast(blocks["wo"]); @@ -509,7 +509,7 @@ struct T5DenseGatedActDense : public UnaryBlock { blocks["wo"] = std::shared_ptr(new Linear(ff_dim, model_dim, false, false, false, scale)); } - struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { + struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override { // x: [N, n_token, model_dim] auto wi_0 = std::dynamic_pointer_cast(blocks["wi_0"]); auto wi_1 = std::dynamic_pointer_cast(blocks["wi_1"]); @@ -530,7 +530,7 @@ struct T5LayerFF : public UnaryBlock { blocks["layer_norm"] = std::shared_ptr(new T5LayerNorm(model_dim)); } - struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { + struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override { // x: [N, n_token, model_dim] auto DenseReluDense = std::dynamic_pointer_cast(blocks["DenseReluDense"]); auto layer_norm = std::dynamic_pointer_cast(blocks["layer_norm"]); @@ -582,9 +582,9 @@ class T5Attention : public GGMLBlock { std::pair forward(struct ggml_context* ctx, ggml_backend_t backend, struct ggml_tensor* x, - struct 
ggml_tensor* past_bias = NULL, - struct ggml_tensor* mask = NULL, - struct ggml_tensor* relative_position_bucket = NULL) { + struct ggml_tensor* past_bias = nullptr, + struct ggml_tensor* mask = nullptr, + struct ggml_tensor* relative_position_bucket = nullptr) { auto q_proj = std::dynamic_pointer_cast(blocks["q"]); auto k_proj = std::dynamic_pointer_cast(blocks["k"]); auto v_proj = std::dynamic_pointer_cast(blocks["v"]); @@ -597,11 +597,11 @@ class T5Attention : public GGMLBlock { auto k = k_proj->forward(ctx, x); auto v = v_proj->forward(ctx, x); - if (using_relative_attention_bias && relative_position_bucket != NULL) { + if (using_relative_attention_bias && relative_position_bucket != nullptr) { past_bias = compute_bias(ctx, relative_position_bucket); } - if (past_bias != NULL) { - if (mask != NULL) { + if (past_bias != nullptr) { + if (mask != nullptr) { mask = ggml_repeat(ctx, mask, past_bias); mask = ggml_add(ctx, mask, past_bias); } else { @@ -632,9 +632,9 @@ struct T5LayerSelfAttention : public GGMLBlock { std::pair forward(struct ggml_context* ctx, ggml_backend_t backend, struct ggml_tensor* x, - struct ggml_tensor* past_bias = NULL, - struct ggml_tensor* mask = NULL, - struct ggml_tensor* relative_position_bucket = NULL) { + struct ggml_tensor* past_bias = nullptr, + struct ggml_tensor* mask = nullptr, + struct ggml_tensor* relative_position_bucket = nullptr) { // x: [N, n_token, model_dim] auto SelfAttention = std::dynamic_pointer_cast(blocks["SelfAttention"]); auto layer_norm = std::dynamic_pointer_cast(blocks["layer_norm"]); @@ -659,9 +659,9 @@ struct T5Block : public GGMLBlock { std::pair forward(struct ggml_context* ctx, ggml_backend_t backend, struct ggml_tensor* x, - struct ggml_tensor* past_bias = NULL, - struct ggml_tensor* mask = NULL, - struct ggml_tensor* relative_position_bucket = NULL) { + struct ggml_tensor* past_bias = nullptr, + struct ggml_tensor* mask = nullptr, + struct ggml_tensor* relative_position_bucket = nullptr) { // x: [N, n_token, model_dim] auto layer_0 = std::dynamic_pointer_cast(blocks["layer.0"]); auto layer_1 = std::dynamic_pointer_cast(blocks["layer.1"]); @@ -695,9 +695,9 @@ struct T5Stack : public GGMLBlock { struct ggml_tensor* forward(struct ggml_context* ctx, ggml_backend_t backend, struct ggml_tensor* x, - struct ggml_tensor* past_bias = NULL, - struct ggml_tensor* attention_mask = NULL, - struct ggml_tensor* relative_position_bucket = NULL) { + struct ggml_tensor* past_bias = nullptr, + struct ggml_tensor* attention_mask = nullptr, + struct ggml_tensor* relative_position_bucket = nullptr) { // x: [N, n_token, model_dim] for (int i = 0; i < num_layers; i++) { auto block = std::dynamic_pointer_cast(blocks["block." 
+ std::to_string(i)]); @@ -743,9 +743,9 @@ struct T5 : public GGMLBlock { struct ggml_tensor* forward(struct ggml_context* ctx, ggml_backend_t backend, struct ggml_tensor* input_ids, - struct ggml_tensor* past_bias = NULL, - struct ggml_tensor* attention_mask = NULL, - struct ggml_tensor* relative_position_bucket = NULL) { + struct ggml_tensor* past_bias = nullptr, + struct ggml_tensor* attention_mask = nullptr, + struct ggml_tensor* relative_position_bucket = nullptr) { // input_ids: [N, n_token] auto shared = std::dynamic_pointer_cast(blocks["shared"]); @@ -776,7 +776,7 @@ struct T5Runner : public GGMLRunner { model.init(params_ctx, tensor_types, prefix); } - std::string get_desc() { + std::string get_desc() override { return "t5"; } @@ -788,16 +788,16 @@ struct T5Runner : public GGMLRunner { ggml_backend_t backend, struct ggml_tensor* input_ids, struct ggml_tensor* relative_position_bucket, - struct ggml_tensor* attention_mask = NULL) { + struct ggml_tensor* attention_mask = nullptr) { size_t N = input_ids->ne[1]; size_t n_token = input_ids->ne[0]; - auto hidden_states = model.forward(ctx, backend, input_ids, NULL, attention_mask, relative_position_bucket); // [N, n_token, model_dim] + auto hidden_states = model.forward(ctx, backend, input_ids, nullptr, attention_mask, relative_position_bucket); // [N, n_token, model_dim] return hidden_states; } struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids, - struct ggml_tensor* attention_mask = NULL) { + struct ggml_tensor* attention_mask = nullptr) { struct ggml_cgraph* gf = ggml_new_graph(compute_ctx); input_ids = to_backend(input_ids); @@ -829,7 +829,7 @@ struct T5Runner : public GGMLRunner { struct ggml_tensor* input_ids, struct ggml_tensor* attention_mask, ggml_tensor** output, - ggml_context* output_ctx = NULL) { + ggml_context* output_ctx = nullptr) { auto get_graph = [&]() -> struct ggml_cgraph* { return build_graph(input_ids, attention_mask); }; @@ -968,11 +968,11 @@ struct T5Embedder { void test() { struct ggml_init_params params; params.mem_size = static_cast(10 * 1024 * 1024); // 10 MB - params.mem_buffer = NULL; + params.mem_buffer = nullptr; params.no_alloc = false; struct ggml_context* work_ctx = ggml_init(params); - GGML_ASSERT(work_ctx != NULL); + GGML_ASSERT(work_ctx != nullptr); { std::string text("a lovely cat"); @@ -987,7 +987,7 @@ struct T5Embedder { printf("\n"); auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens); auto attention_mask = vector_to_ggml_tensor(work_ctx, masks); - struct ggml_tensor* out = NULL; + struct ggml_tensor* out = nullptr; int t0 = ggml_time_ms(); model.compute(8, input_ids, attention_mask, &out, work_ctx); @@ -1022,7 +1022,7 @@ struct T5Embedder { } } - std::shared_ptr t5 = std::shared_ptr(new T5Embedder(backend, false, tensor_types, "", true)); + std::shared_ptr t5 = std::make_shared(backend, false, tensor_types, "", true); t5->alloc_params_buffer(); std::map tensors; diff --git a/tae.hpp b/tae.hpp index 41bcbe2f..d630325d 100644 --- a/tae.hpp +++ b/tae.hpp @@ -29,7 +29,7 @@ class TAEBlock : public UnaryBlock { } } - struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { + struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override { // x: [n, n_in, h, w] // return: [n, n_out, h, w] @@ -86,7 +86,7 @@ class TinyEncoder : public UnaryBlock { blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, z_channels, {3, 3}, {1, 1}, {1, 1})); } - struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { + 
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override { // x: [n, in_channels, h, w] // return: [n, z_channels, h/8, w/8] @@ -136,7 +136,7 @@ class TinyDecoder : public UnaryBlock { blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1})); } - struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* z) { + struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* z) override { // z: [n, z_channels, h, w] // return: [n, out_channels, h*8, w*8] @@ -218,7 +218,7 @@ struct TinyAutoEncoder : public GGMLRunner { } } - std::string get_desc() { + std::string get_desc() override { return "taesd"; } @@ -261,7 +261,7 @@ struct TinyAutoEncoder : public GGMLRunner { struct ggml_tensor* z, bool decode_graph, struct ggml_tensor** output, - struct ggml_context* output_ctx = NULL) { + struct ggml_context* output_ctx = nullptr) { auto get_graph = [&]() -> struct ggml_cgraph* { return build_graph(z, decode_graph); }; diff --git a/unet.hpp b/unet.hpp index 19bedb32..7022a7c9 100644 --- a/unet.hpp +++ b/unet.hpp @@ -384,8 +384,8 @@ class UnetModelBlock : public GGMLBlock { struct ggml_tensor* x, struct ggml_tensor* timesteps, struct ggml_tensor* context, - struct ggml_tensor* c_concat = NULL, - struct ggml_tensor* y = NULL, + struct ggml_tensor* c_concat = nullptr, + struct ggml_tensor* y = nullptr, int num_video_frames = -1, std::vector controls = {}, float control_strength = 0.f) { @@ -395,20 +395,20 @@ class UnetModelBlock : public GGMLBlock { // c_concat: [N, in_channels, h, w] or [1, in_channels, h, w] // y: [N, adm_in_channels] or [1, adm_in_channels] // return: [N, out_channels, h, w] - if (context != NULL) { + if (context != nullptr) { if (context->ne[2] != x->ne[3]) { context = ggml_repeat(ctx, context, ggml_new_tensor_3d(ctx, GGML_TYPE_F32, context->ne[0], context->ne[1], x->ne[3])); } } - if (c_concat != NULL) { + if (c_concat != nullptr) { if (c_concat->ne[3] != x->ne[3]) { c_concat = ggml_repeat(ctx, c_concat, x); } x = ggml_concat(ctx, x, c_concat, 2); } - if (y != NULL) { + if (y != nullptr) { if (y->ne[1] != x->ne[3]) { y = ggml_repeat(ctx, y, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, y->ne[0], x->ne[3])); } @@ -428,7 +428,7 @@ class UnetModelBlock : public GGMLBlock { emb = time_embed_2->forward(ctx, emb); // [N, time_embed_dim] // SDXL/SVD - if (y != NULL) { + if (y != nullptr) { auto label_embed_0 = std::dynamic_pointer_cast(blocks["label_emb.0.0"]); auto label_embed_2 = std::dynamic_pointer_cast(blocks["label_emb.0.2"]); @@ -562,7 +562,7 @@ struct UNetModelRunner : public GGMLRunner { } } - std::string get_desc() { + std::string get_desc() override { return "unet"; } @@ -573,8 +573,8 @@ struct UNetModelRunner : public GGMLRunner { struct ggml_cgraph* build_graph(struct ggml_tensor* x, struct ggml_tensor* timesteps, struct ggml_tensor* context, - struct ggml_tensor* c_concat = NULL, - struct ggml_tensor* y = NULL, + struct ggml_tensor* c_concat = nullptr, + struct ggml_tensor* y = nullptr, int num_video_frames = -1, std::vector controls = {}, float control_strength = 0.f) { @@ -619,8 +619,8 @@ struct UNetModelRunner : public GGMLRunner { int num_video_frames = -1, std::vector controls = {}, float control_strength = 0.f, - struct ggml_tensor** output = NULL, - struct ggml_context* output_ctx = NULL) { + struct ggml_tensor** output = nullptr, + struct ggml_context* output_ctx = nullptr) { // x: [N, in_channels, h, w] // timesteps: [N, ] // context: [N, max_position, 
hidden_size]([N, 77, 768]) or [1, max_position, hidden_size] @@ -636,11 +636,11 @@ struct UNetModelRunner : public GGMLRunner { void test() { struct ggml_init_params params; params.mem_size = static_cast(10 * 1024 * 1024); // 10 MB - params.mem_buffer = NULL; + params.mem_buffer = nullptr; params.no_alloc = false; struct ggml_context* work_ctx = ggml_init(params); - GGML_ASSERT(work_ctx != NULL); + GGML_ASSERT(work_ctx != nullptr); { // CPU, num_video_frames = 1, x{num_video_frames, 8, 8, 8}: Pass @@ -663,10 +663,10 @@ struct UNetModelRunner : public GGMLRunner { ggml_set_f32(y, 0.5f); // print_ggml_tensor(y); - struct ggml_tensor* out = NULL; + struct ggml_tensor* out = nullptr; int t0 = ggml_time_ms(); - compute(8, x, timesteps, context, NULL, y, num_video_frames, {}, 0.f, &out, work_ctx); + compute(8, x, timesteps, context, nullptr, y, num_video_frames, {}, 0.f, &out, work_ctx); int t1 = ggml_time_ms(); print_ggml_tensor(out); diff --git a/upscaler.cpp b/upscaler.cpp index d3042372..68eb50ef 100644 --- a/upscaler.cpp +++ b/upscaler.cpp @@ -4,17 +4,20 @@ #include "stable-diffusion.h" struct UpscalerGGML { - ggml_backend_t backend = NULL; // general backend + ggml_backend_t backend = nullptr; // general backend ggml_type model_data_type = GGML_TYPE_F16; std::shared_ptr esrgan_upscaler; std::string esrgan_path; int n_threads; bool direct = false; + int tile_size = 128; UpscalerGGML(int n_threads, - bool direct = false) + bool direct = false, + int tile_size = 128) : n_threads(n_threads), - direct(direct) { + direct(direct), + tile_size(tile_size) { } bool load_from_file(const std::string& esrgan_path, @@ -51,7 +54,7 @@ struct UpscalerGGML { backend = ggml_backend_cpu_init(); } LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type)); - esrgan_upscaler = std::make_shared(backend, offload_params_to_cpu, model_loader.tensor_storages_types); + esrgan_upscaler = std::make_shared(backend, offload_params_to_cpu, tile_size, model_loader.tensor_storages_types); if (direct) { esrgan_upscaler->enable_conv2d_direct(); } @@ -63,7 +66,7 @@ struct UpscalerGGML { sd_image_t upscale(sd_image_t input_image, uint32_t upscale_factor) { // upscale_factor, unused for RealESRGAN_x4plus_anime_6B.pth - sd_image_t upscaled_image = {0, 0, 0, NULL}; + sd_image_t upscaled_image = {0, 0, 0, nullptr}; int output_width = (int)input_image.width * esrgan_upscaler->scale; int output_height = (int)input_image.height * esrgan_upscaler->scale; LOG_INFO("upscaling from (%i x %i) to (%i x %i)", @@ -71,7 +74,7 @@ struct UpscalerGGML { struct ggml_init_params params; params.mem_size = static_cast(1024 * 1024) * 1024; // 1G - params.mem_buffer = NULL; + params.mem_buffer = nullptr; params.no_alloc = false; // draft context @@ -107,29 +110,30 @@ struct UpscalerGGML { }; struct upscaler_ctx_t { - UpscalerGGML* upscaler = NULL; + UpscalerGGML* upscaler = nullptr; }; upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str, bool offload_params_to_cpu, bool direct, - int n_threads) { + int n_threads, + int tile_size) { upscaler_ctx_t* upscaler_ctx = (upscaler_ctx_t*)malloc(sizeof(upscaler_ctx_t)); - if (upscaler_ctx == NULL) { - return NULL; + if (upscaler_ctx == nullptr) { + return nullptr; } std::string esrgan_path(esrgan_path_c_str); - upscaler_ctx->upscaler = new UpscalerGGML(n_threads, direct); - if (upscaler_ctx->upscaler == NULL) { - return NULL; + upscaler_ctx->upscaler = new UpscalerGGML(n_threads, direct, tile_size); + if (upscaler_ctx->upscaler == nullptr) { + return nullptr; } if 
(!upscaler_ctx->upscaler->load_from_file(esrgan_path, offload_params_to_cpu, n_threads)) { delete upscaler_ctx->upscaler; - upscaler_ctx->upscaler = NULL; + upscaler_ctx->upscaler = nullptr; free(upscaler_ctx); - return NULL; + return nullptr; } return upscaler_ctx; } @@ -139,16 +143,16 @@ sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, sd_image_t input_image, uint32_ } int get_upscale_factor(upscaler_ctx_t* upscaler_ctx) { - if (upscaler_ctx == NULL || upscaler_ctx->upscaler == NULL || upscaler_ctx->upscaler->esrgan_upscaler == NULL) { + if (upscaler_ctx == nullptr || upscaler_ctx->upscaler == nullptr || upscaler_ctx->upscaler->esrgan_upscaler == nullptr) { return 1; } return upscaler_ctx->upscaler->esrgan_upscaler->scale; } void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx) { - if (upscaler_ctx->upscaler != NULL) { + if (upscaler_ctx->upscaler != nullptr) { delete upscaler_ctx->upscaler; - upscaler_ctx->upscaler = NULL; + upscaler_ctx->upscaler = nullptr; } free(upscaler_ctx); } diff --git a/util.cpp b/util.cpp index 1d0bbd2b..d6d06752 100644 --- a/util.cpp +++ b/util.cpp @@ -1,8 +1,8 @@ #include "util.h" -#include #include #include #include +#include #include #include #include @@ -64,7 +64,7 @@ std::string format(const char* fmt, ...) { va_list ap2; va_start(ap, fmt); va_copy(ap2, ap); - int size = vsnprintf(NULL, 0, fmt, ap); + int size = vsnprintf(nullptr, 0, fmt, ap); std::vector buf(size + 1); int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); va_end(ap2); @@ -170,11 +170,11 @@ int32_t get_num_physical_cores() { #elif defined(__APPLE__) && defined(__MACH__) int32_t num_physical_cores; size_t len = sizeof(num_physical_cores); - int result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, NULL, 0); + int result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, nullptr, 0); if (result == 0) { return num_physical_cores; } - result = sysctlbyname("hw.physicalcpu", &num_physical_cores, &len, NULL, 0); + result = sysctlbyname("hw.physicalcpu", &num_physical_cores, &len, nullptr, 0); if (result == 0) { return num_physical_cores; } @@ -185,8 +185,8 @@ int32_t get_num_physical_cores() { return n_threads > 0 ? (n_threads <= 4 ? 
n_threads : n_threads / 2) : 4; } -static sd_progress_cb_t sd_progress_cb = NULL; -void* sd_progress_cb_data = NULL; +static sd_progress_cb_t sd_progress_cb = nullptr; +void* sd_progress_cb_data = nullptr; std::u32string utf8_to_utf32(const std::string& utf8_str) { std::wstring_convert, char32_t> converter; @@ -296,8 +296,8 @@ std::string trim(const std::string& s) { return rtrim(ltrim(s)); } -static sd_log_cb_t sd_log_cb = NULL; -void* sd_log_cb_data = NULL; +static sd_log_cb_t sd_log_cb = nullptr; +void* sd_log_cb_data = nullptr; #define LOG_BUFFER_SIZE 4096 diff --git a/vae.hpp b/vae.hpp index 20d97a2a..455edae0 100644 --- a/vae.hpp +++ b/vae.hpp @@ -30,7 +30,7 @@ class ResnetBlock : public UnaryBlock { } } - struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { + struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override { // x: [N, in_channels, h, w] // t_emb is always None auto norm1 = std::dynamic_pointer_cast(blocks["norm1"]); @@ -76,7 +76,7 @@ class AttnBlock : public UnaryBlock { blocks["proj_out"] = std::shared_ptr(new Conv2d(in_channels, in_channels, {1, 1})); } - struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { + struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override { // x: [N, in_channels, h, w] auto norm = std::dynamic_pointer_cast(blocks["norm"]); auto q_proj = std::dynamic_pointer_cast(blocks["q"]); @@ -134,7 +134,7 @@ class AE3DConv : public Conv2d { } struct ggml_tensor* forward(struct ggml_context* ctx, - struct ggml_tensor* x) { + struct ggml_tensor* x) override { // timesteps always None // skip_video always False // x: [N, IC, IH, IW] @@ -163,7 +163,7 @@ class AE3DConv : public Conv2d { class VideoResnetBlock : public ResnetBlock { protected: - void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") { + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") override { enum ggml_type wtype = get_type(prefix + "mix_factor", tensor_types, GGML_TYPE_F32); params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1); } @@ -182,7 +182,7 @@ class VideoResnetBlock : public ResnetBlock { blocks["time_stack"] = std::shared_ptr(new ResBlock(out_channels, 0, out_channels, {video_kernel_size, 1}, 3, false, true)); } - struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { + struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override { // x: [N, in_channels, h, w] aka [b*t, in_channels, h, w] // return: [N, out_channels, h, w] aka [b*t, out_channels, h, w] // t_emb is always None @@ -548,7 +548,7 @@ struct AutoEncoderKL : public VAE { ae.init(params_ctx, tensor_types, prefix); } - void enable_conv2d_direct() { + void enable_conv2d_direct() override { std::vector blocks; ae.get_all_blocks(blocks); for (auto block : blocks) { @@ -559,7 +559,7 @@ struct AutoEncoderKL : public VAE { } } - void set_conv2d_scale(float scale) { + void set_conv2d_scale(float scale) override { std::vector blocks; ae.get_all_blocks(blocks); for (auto block : blocks) { @@ -570,11 +570,11 @@ struct AutoEncoderKL : public VAE { } } - std::string get_desc() { + std::string get_desc() override { return "vae"; } - void get_param_tensors(std::map& tensors, const std::string prefix) { + void get_param_tensors(std::map& tensors, const std::string prefix) override { ae.get_param_tensors(tensors, prefix); } @@ -594,7 +594,7 @@ struct 
AutoEncoderKL : public VAE { struct ggml_tensor* z, bool decode_graph, struct ggml_tensor** output, - struct ggml_context* output_ctx = NULL) { + struct ggml_context* output_ctx = nullptr) override { GGML_ASSERT(!decode_only || decode_graph); auto get_graph = [&]() -> struct ggml_cgraph* { return build_graph(z, decode_graph); @@ -607,11 +607,11 @@ struct AutoEncoderKL : public VAE { void test() { struct ggml_init_params params; params.mem_size = static_cast(10 * 1024 * 1024); // 10 MB - params.mem_buffer = NULL; + params.mem_buffer = nullptr; params.no_alloc = false; struct ggml_context* work_ctx = ggml_init(params); - GGML_ASSERT(work_ctx != NULL); + GGML_ASSERT(work_ctx != nullptr); { // CPU, x{1, 3, 64, 64}: Pass @@ -621,7 +621,7 @@ struct AutoEncoderKL : public VAE { auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 64, 64, 3, 2); ggml_set_f32(x, 0.5f); print_ggml_tensor(x); - struct ggml_tensor* out = NULL; + struct ggml_tensor* out = nullptr; int t0 = ggml_time_ms(); compute(8, x, false, &out, work_ctx); @@ -639,7 +639,7 @@ struct AutoEncoderKL : public VAE { auto z = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 4, 1); ggml_set_f32(z, 0.5f); print_ggml_tensor(z); - struct ggml_tensor* out = NULL; + struct ggml_tensor* out = nullptr; int t0 = ggml_time_ms(); compute(8, z, true, &out, work_ctx); diff --git a/wan.hpp b/wan.hpp index 31fa90b3..b6a08fde 100644 --- a/wan.hpp +++ b/wan.hpp @@ -2,6 +2,8 @@ #define __WAN_HPP__ #include +#include +#include #include "common.hpp" #include "flux.hpp" @@ -24,7 +26,7 @@ namespace WAN { std::tuple dilation; bool bias; - void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") { + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") override { params["weight"] = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, std::get<2>(kernel_size), @@ -46,17 +48,17 @@ namespace WAN { bool bias = true) : in_channels(in_channels), out_channels(out_channels), - kernel_size(kernel_size), - stride(stride), - padding(padding), - dilation(dilation), + kernel_size(std::move(kernel_size)), + stride(std::move(stride)), + padding(std::move(padding)), + dilation(std::move(dilation)), bias(bias) {} - struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* cache_x = NULL) { + struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* cache_x = nullptr) { // x: [N*IC, ID, IH, IW] // result: x: [N*OC, ID, IH, IW] struct ggml_tensor* w = params["weight"]; - struct ggml_tensor* b = NULL; + struct ggml_tensor* b = nullptr; if (bias) { b = params["bias"]; } @@ -68,7 +70,7 @@ namespace WAN { int lp2 = 2 * std::get<0>(padding); int rp2 = 0; - if (cache_x != NULL && lp2 > 0) { + if (cache_x != nullptr && lp2 > 0) { x = ggml_concat(ctx, cache_x, x, 2); lp2 -= (int)cache_x->ne[2]; } @@ -85,7 +87,7 @@ namespace WAN { protected: int64_t dim; - void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") { + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") override { ggml_type wtype = GGML_TYPE_F32; params["gamma"] = ggml_new_tensor_1d(ctx, wtype, dim); } @@ -94,7 +96,7 @@ namespace WAN { RMS_norm(int64_t dim) : dim(dim) {} - struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { + struct ggml_tensor* forward(struct ggml_context* ctx, struct 
ggml_tensor* x) override { // x: [N*IC, ID, IH, IW], IC == dim // assert N == 1 @@ -159,12 +161,12 @@ namespace WAN { int idx = feat_idx; feat_idx += 1; if (chunk_idx == 0) { - // feat_cache[idx] == NULL, pass + // feat_cache[idx] == nullptr, pass } else { auto time_conv = std::dynamic_pointer_cast(blocks["time_conv"]); auto cache_x = ggml_slice(ctx, x, 2, -CACHE_T, x->ne[2]); - if (cache_x->ne[2] < 2 && feat_cache[idx] != NULL) { // chunk_idx >= 2 + if (cache_x->ne[2] < 2 && feat_cache[idx] != nullptr) { // chunk_idx >= 2 // cache last frame of last two chunk cache_x = ggml_concat(ctx, ggml_slice(ctx, feat_cache[idx], 2, -1, feat_cache[idx]->ne[2]), @@ -209,7 +211,7 @@ namespace WAN { if (mode == "downsample3d") { if (feat_cache.size() > 0) { int idx = feat_idx; - if (feat_cache[idx] == NULL) { + if (feat_cache[idx] == nullptr) { feat_cache[idx] = x; feat_idx += 1; } else { @@ -373,7 +375,7 @@ namespace WAN { if (feat_cache.size() > 0) { int idx = feat_idx; auto cache_x = ggml_slice(ctx, x, 2, -CACHE_T, x->ne[2]); - if (cache_x->ne[2] < 2 && feat_cache[idx] != NULL) { + if (cache_x->ne[2] < 2 && feat_cache[idx] != nullptr) { // cache last frame of last two chunk cache_x = ggml_concat(ctx, ggml_slice(ctx, feat_cache[idx], 2, -1, feat_cache[idx]->ne[2]), @@ -566,7 +568,7 @@ namespace WAN { x = ggml_nn_attention(ctx, q, k, v, false); // [t, h * w, c] // v = ggml_cont(ctx, ggml_torch_permute(ctx, v, 1, 0, 2, 3)); // [t, h * w, c] - // x = ggml_nn_attention_ext(ctx, q, k, v, q->ne[2], NULL, false, false, true); + // x = ggml_nn_attention_ext(ctx, q, k, v, q->ne[2], nullptr, false, false, true); x = ggml_nn_cont(ctx, ggml_permute(ctx, x, 1, 0, 2, 3)); // [t, c, h * w] x = ggml_reshape_4d(ctx, x, w, h, c, n); // [t, c, h, w] @@ -672,7 +674,7 @@ namespace WAN { if (feat_cache.size() > 0) { int idx = feat_idx; auto cache_x = ggml_slice(ctx, x, 2, -CACHE_T, x->ne[2]); - if (cache_x->ne[2] < 2 && feat_cache[idx] != NULL) { + if (cache_x->ne[2] < 2 && feat_cache[idx] != nullptr) { // cache last frame of last two chunk cache_x = ggml_concat(ctx, ggml_slice(ctx, feat_cache[idx], 2, -1, feat_cache[idx]->ne[2]), @@ -724,7 +726,7 @@ namespace WAN { if (feat_cache.size() > 0) { int idx = feat_idx; auto cache_x = ggml_slice(ctx, x, 2, -CACHE_T, x->ne[2]); - if (cache_x->ne[2] < 2 && feat_cache[idx] != NULL) { + if (cache_x->ne[2] < 2 && feat_cache[idx] != nullptr) { // cache last frame of last two chunk cache_x = ggml_concat(ctx, ggml_slice(ctx, feat_cache[idx], 2, -1, feat_cache[idx]->ne[2]), @@ -843,7 +845,7 @@ namespace WAN { if (feat_cache.size() > 0) { int idx = feat_idx; auto cache_x = ggml_slice(ctx, x, 2, -CACHE_T, x->ne[2]); - if (cache_x->ne[2] < 2 && feat_cache[idx] != NULL) { + if (cache_x->ne[2] < 2 && feat_cache[idx] != nullptr) { // cache last frame of last two chunk cache_x = ggml_concat(ctx, ggml_slice(ctx, feat_cache[idx], 2, -1, feat_cache[idx]->ne[2]), @@ -895,7 +897,7 @@ namespace WAN { if (feat_cache.size() > 0) { int idx = feat_idx; auto cache_x = ggml_slice(ctx, x, 2, -CACHE_T, x->ne[2]); - if (cache_x->ne[2] < 2 && feat_cache[idx] != NULL) { + if (cache_x->ne[2] < 2 && feat_cache[idx] != nullptr) { // cache last frame of last two chunk cache_x = ggml_concat(ctx, ggml_slice(ctx, feat_cache[idx], 2, -1, feat_cache[idx]->ne[2]), @@ -935,9 +937,9 @@ namespace WAN { void clear_cache() { _conv_idx = 0; - _feat_map = std::vector(_conv_num, NULL); + _feat_map = std::vector(_conv_num, nullptr); _enc_conv_idx = 0; - _enc_feat_map = std::vector(_enc_conv_num, NULL); + _enc_feat_map = 
std::vector(_enc_conv_num, nullptr); } public: @@ -1116,11 +1118,11 @@ namespace WAN { ae.init(params_ctx, tensor_types, prefix); } - std::string get_desc() { + std::string get_desc() override { return "wan_vae"; } - void get_param_tensors(std::map& tensors, const std::string prefix) { + void get_param_tensors(std::map& tensors, const std::string prefix) override { ae.get_param_tensors(tensors, prefix); } @@ -1152,7 +1154,7 @@ namespace WAN { for (int64_t feat_idx = 0; feat_idx < ae._feat_map.size(); feat_idx++) { ggml_tensor* feat_cache = ae._feat_map[feat_idx]; - if (feat_cache != NULL) { + if (feat_cache != nullptr) { cache("feat_idx:" + std::to_string(feat_idx), feat_cache); ggml_build_forward_expand(gf, feat_cache); } @@ -1167,7 +1169,7 @@ namespace WAN { struct ggml_tensor* z, bool decode_graph, struct ggml_tensor** output, - struct ggml_context* output_ctx = NULL) { + struct ggml_context* output_ctx = nullptr) override { if (true) { auto get_graph = [&]() -> struct ggml_cgraph* { return build_graph(z, decode_graph); @@ -1180,7 +1182,7 @@ namespace WAN { auto get_graph = [&]() -> struct ggml_cgraph* { return build_graph_partial(z, decode_graph, i); }; - struct ggml_tensor* out = NULL; + struct ggml_tensor* out = nullptr; GGMLRunner::compute(get_graph, n_threads, true, &out, output_ctx); ae.clear_cache(); if (t == 1) { @@ -1220,11 +1222,11 @@ namespace WAN { void test() { struct ggml_init_params params; params.mem_size = static_cast(1024 * 1024) * 1024; // 1G - params.mem_buffer = NULL; + params.mem_buffer = nullptr; params.no_alloc = false; struct ggml_context* work_ctx = ggml_init(params); - GGML_ASSERT(work_ctx != NULL); + GGML_ASSERT(work_ctx != nullptr); if (true) { // cpu f32, pass @@ -1235,7 +1237,7 @@ namespace WAN { ggml_set_f32(z, 0.5f); z = load_tensor_from_file(work_ctx, "wan_vae_z.bin"); print_ggml_tensor(z); - struct ggml_tensor* out = NULL; + struct ggml_tensor* out = nullptr; int64_t t0 = ggml_time_ms(); compute(8, z, true, &out, work_ctx); @@ -1250,7 +1252,7 @@ namespace WAN { // ggml_backend_t backend = ggml_backend_cuda_init(0); ggml_backend_t backend = ggml_backend_cpu_init(); ggml_type model_data_type = GGML_TYPE_F16; - std::shared_ptr vae = std::shared_ptr(new WanVAERunner(backend, false, {}, "", false, VERSION_WAN2_2_TI2V)); + std::shared_ptr vae = std::make_shared(backend, false, String2GGMLType{}, "", false, VERSION_WAN2_2_TI2V); { LOG_INFO("loading from '%s'", file_path.c_str()); @@ -1309,7 +1311,7 @@ namespace WAN { ggml_backend_t backend, struct ggml_tensor* x, struct ggml_tensor* pe, - struct ggml_tensor* mask = NULL) { + struct ggml_tensor* mask = nullptr) { // x: [N, n_token, dim] // pe: [n_token, d_head/2, 2, 2] // return [N, n_token, dim] @@ -1367,7 +1369,7 @@ namespace WAN { ggml_backend_t backend, struct ggml_tensor* x, struct ggml_tensor* context, - int64_t context_img_len) { + int64_t context_img_len) override { // x: [N, n_token, dim] // context: [N, n_context, dim] // context_img_len: unused @@ -1388,7 +1390,7 @@ namespace WAN { k = norm_k->forward(ctx, k); auto v = v_proj->forward(ctx, context); // [N, n_context, dim] - x = ggml_nn_attention_ext(ctx, backend, q, k, v, num_heads, NULL, false, false, flash_attn); // [N, n_token, dim] + x = ggml_nn_attention_ext(ctx, backend, q, k, v, num_heads, nullptr, false, false, flash_attn); // [N, n_token, dim] x = o_proj->forward(ctx, x); // [N, n_token, dim] return x; @@ -1417,7 +1419,7 @@ namespace WAN { ggml_backend_t backend, struct ggml_tensor* x, struct ggml_tensor* context, - int64_t context_img_len) 
{ + int64_t context_img_len) override { // x: [N, n_token, dim] // context: [N, context_img_len + context_txt_len, dim] // return [N, n_token, dim] @@ -1455,8 +1457,8 @@ namespace WAN { k_img = norm_k_img->forward(ctx, k_img); auto v_img = v_img_proj->forward(ctx, context_img); // [N, context_img_len, dim] - auto img_x = ggml_nn_attention_ext(ctx, backend, q, k_img, v_img, num_heads, NULL, false, false, flash_attn); // [N, n_token, dim] - x = ggml_nn_attention_ext(ctx, backend, q, k, v, num_heads, NULL, false, false, flash_attn); // [N, n_token, dim] + auto img_x = ggml_nn_attention_ext(ctx, backend, q, k_img, v_img, num_heads, nullptr, false, false, flash_attn); // [N, n_token, dim] + x = ggml_nn_attention_ext(ctx, backend, q, k, v, num_heads, nullptr, false, false, flash_attn); // [N, n_token, dim] x = ggml_add(ctx, x, img_x); @@ -1497,7 +1499,7 @@ namespace WAN { protected: int dim; - void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") { + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") override { enum ggml_type wtype = get_type(prefix + "weight", tensor_types, GGML_TYPE_F32); params["modulation"] = ggml_new_tensor_3d(ctx, wtype, dim, 6, 1); } @@ -1587,7 +1589,7 @@ namespace WAN { class VaceWanAttentionBlock : public WanAttentionBlock { protected: int block_id; - void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") { + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") override { enum ggml_type wtype = get_type(prefix + "weight", tensor_types, GGML_TYPE_F32); params["modulation"] = ggml_new_tensor_3d(ctx, wtype, dim, 6, 1); } @@ -1641,7 +1643,7 @@ namespace WAN { protected: int dim; - void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") { + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") override { enum ggml_type wtype = get_type(prefix + "weight", tensor_types, GGML_TYPE_F32); params["modulation"] = ggml_new_tensor_3d(ctx, wtype, dim, 2, 1); } @@ -1688,7 +1690,7 @@ namespace WAN { int in_dim; int flf_pos_embed_token_number; - void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") { + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") override { if (flf_pos_embed_token_number > 0) { params["emb_pos"] = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, in_dim, flf_pos_embed_token_number, 1); } @@ -1876,8 +1878,8 @@ namespace WAN { struct ggml_tensor* timestep, struct ggml_tensor* context, struct ggml_tensor* pe, - struct ggml_tensor* clip_fea = NULL, - struct ggml_tensor* vace_context = NULL, + struct ggml_tensor* clip_fea = nullptr, + struct ggml_tensor* vace_context = nullptr, float vace_strength = 1.f, int64_t N = 1) { // x: [N*C, T, H, W], C => in_dim @@ -1920,7 +1922,7 @@ namespace WAN { context = text_embedding_2->forward(ctx, context); // [N, context_txt_len, dim] int64_t context_img_len = 0; - if (clip_fea != NULL) { + if (clip_fea != nullptr) { if (params.model_type == "i2v") { auto img_emb = std::dynamic_pointer_cast(blocks["img_emb"]); auto context_img = img_emb->forward(ctx, clip_fea); // [N, context_img_len, dim] @@ -1930,7 +1932,7 @@ namespace WAN { } // vace_patch_embedding - 
ggml_tensor* c = NULL; + ggml_tensor* c = nullptr; if (params.vace_layers > 0) { auto vace_patch_embedding = std::dynamic_pointer_cast(blocks["vace_patch_embedding"]); @@ -1971,9 +1973,9 @@ namespace WAN { struct ggml_tensor* timestep, struct ggml_tensor* context, struct ggml_tensor* pe, - struct ggml_tensor* clip_fea = NULL, - struct ggml_tensor* time_dim_concat = NULL, - struct ggml_tensor* vace_context = NULL, + struct ggml_tensor* clip_fea = nullptr, + struct ggml_tensor* time_dim_concat = nullptr, + struct ggml_tensor* vace_context = nullptr, float vace_strength = 1.f, int64_t N = 1) { // Forward pass of DiT. @@ -1997,7 +1999,7 @@ namespace WAN { int64_t h_len = ((H + (std::get<1>(params.patch_size) / 2)) / std::get<1>(params.patch_size)); int64_t w_len = ((W + (std::get<2>(params.patch_size) / 2)) / std::get<2>(params.patch_size)); - if (time_dim_concat != NULL) { + if (time_dim_concat != nullptr) { time_dim_concat = pad_to_patch_size(ctx, time_dim_concat); x = ggml_concat(ctx, x, time_dim_concat, 2); // [N*C, (T+pad_t) + (T2+pad_t2), H + pad_h, W + pad_w] t_len = ((x->ne[2] + (std::get<0>(params.patch_size) / 2)) / std::get<0>(params.patch_size)); @@ -2134,7 +2136,7 @@ namespace WAN { wan.init(params_ctx, tensor_types, prefix); } - std::string get_desc() { + std::string get_desc() override { return desc; } @@ -2145,10 +2147,10 @@ namespace WAN { struct ggml_cgraph* build_graph(struct ggml_tensor* x, struct ggml_tensor* timesteps, struct ggml_tensor* context, - struct ggml_tensor* clip_fea = NULL, - struct ggml_tensor* c_concat = NULL, - struct ggml_tensor* time_dim_concat = NULL, - struct ggml_tensor* vace_context = NULL, + struct ggml_tensor* clip_fea = nullptr, + struct ggml_tensor* c_concat = nullptr, + struct ggml_tensor* time_dim_concat = nullptr, + struct ggml_tensor* vace_context = nullptr, float vace_strength = 1.f) { struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, WAN_GRAPH_SIZE, false); @@ -2174,10 +2176,10 @@ namespace WAN { auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, wan_params.axes_dim_sum / 2, pos_len); // pe->data = pe_vec.data(); // print_ggml_tensor(pe); - // pe->data = NULL; + // pe->data = nullptr; set_backend_tensor_data(pe, pe_vec.data()); - if (c_concat != NULL) { + if (c_concat != nullptr) { x = ggml_concat(compute_ctx, x, c_concat, 3); } @@ -2201,13 +2203,13 @@ namespace WAN { struct ggml_tensor* x, struct ggml_tensor* timesteps, struct ggml_tensor* context, - struct ggml_tensor* clip_fea = NULL, - struct ggml_tensor* c_concat = NULL, - struct ggml_tensor* time_dim_concat = NULL, - struct ggml_tensor* vace_context = NULL, + struct ggml_tensor* clip_fea = nullptr, + struct ggml_tensor* c_concat = nullptr, + struct ggml_tensor* time_dim_concat = nullptr, + struct ggml_tensor* vace_context = nullptr, float vace_strength = 1.f, - struct ggml_tensor** output = NULL, - struct ggml_context* output_ctx = NULL) { + struct ggml_tensor** output = nullptr, + struct ggml_context* output_ctx = nullptr) { auto get_graph = [&]() -> struct ggml_cgraph* { return build_graph(x, timesteps, context, clip_fea, c_concat, time_dim_concat, vace_context, vace_strength); }; @@ -2218,11 +2220,11 @@ namespace WAN { void test() { struct ggml_init_params params; params.mem_size = static_cast(200 * 1024 * 1024); // 200 MB - params.mem_buffer = NULL; + params.mem_buffer = nullptr; params.no_alloc = false; struct ggml_context* work_ctx = ggml_init(params); - GGML_ASSERT(work_ctx != NULL); + GGML_ASSERT(work_ctx != nullptr); { // cpu f16: pass @@ -2244,10 
+2246,10 @@ namespace WAN { // auto clip_fea = load_tensor_from_file(work_ctx, "wan_dit_clip_fea.bin"); // print_ggml_tensor(clip_fea); - struct ggml_tensor* out = NULL; + struct ggml_tensor* out = nullptr; int t0 = ggml_time_ms(); - compute(8, x, timesteps, context, NULL, NULL, NULL, NULL, 1.f, &out, work_ctx); + compute(8, x, timesteps, context, nullptr, nullptr, nullptr, nullptr, 1.f, &out, work_ctx); int t1 = ggml_time_ms(); print_ggml_tensor(out); @@ -2275,12 +2277,12 @@ namespace WAN { } } - std::shared_ptr wan = std::shared_ptr(new WanRunner(backend, - false, - tensor_types, - "model.diffusion_model", - VERSION_WAN2_2_TI2V, - true)); + std::shared_ptr wan = std::make_shared(backend, + false, + tensor_types, + "model.diffusion_model", + VERSION_WAN2_2_TI2V, + true); wan->alloc_params_buffer(); std::map tensors;
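Note on the smart-pointer rewrite in the hunk above (and in the earlier T5Embedder and WanVAERunner hunks): constructing with std::make_shared<T>(...) instead of std::shared_ptr<T>(new T(...)) allocates the object and its control block in a single step and avoids spelling the type twice. A minimal self-contained sketch of the pattern follows; the Runner type and its constructor arguments are hypothetical and only illustrate the rewrite, they are not part of this patch:

#include <memory>
#include <string>
#include <utility>

struct Runner {
    int n_threads;
    std::string desc;
    Runner(int n_threads, std::string desc)
        : n_threads(n_threads), desc(std::move(desc)) {}
};

int main() {
    // old style: the type is named twice and two separate allocations are made
    // (one for the Runner object, one for the shared_ptr control block)
    std::shared_ptr<Runner> a{new Runner(8, "demo")};

    // rewritten style, as applied throughout this patch: one allocation, type named once
    auto b = std::make_shared<Runner>(8, "demo");

    return a->n_threads == b->n_threads ? 0 : 1;
}

Besides the single allocation, make_shared keeps the pointer exception-safe when the construction appears inside a larger expression, since no raw new result can leak between evaluation steps.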