Skip to content

Commit 99d78d0

Browse files
committed
add base ltx2.3 support
1 parent 274ecd5 commit 99d78d0

19 files changed

Lines changed: 3799 additions & 149 deletions

examples/cli/main.cpp

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919
#include "common/media_io.h"
2020
#include "common/resource_owners.hpp"
2121
#include "image_metadata.h"
22-
#include "llm.hpp"
2322

2423
namespace fs = std::filesystem;
2524

@@ -501,15 +500,6 @@ int main(int argc, const char* argv[]) {
501500
SDContextParams ctx_params;
502501
SDGenerationParams gen_params;
503502

504-
cli_params.verbose = true;
505-
sd_set_log_callback(sd_log_cb, (void*)&cli_params);
506-
GemmaTokenizer tokenizer;
507-
auto tokens = tokenizer.tokenize("<html> 一只可爱的小猫");
508-
for (auto token : tokens) {
509-
LOG_INFO("%d", token);
510-
}
511-
return 0;
512-
513503
parse_args(argc, argv, cli_params, ctx_params, gen_params);
514504
sd_set_log_callback(sd_log_cb, (void*)&cli_params);
515505
log_verbose = cli_params.verbose;

examples/common/common.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -340,6 +340,10 @@ ArgOptions SDContextParams::get_options() {
340340
"--high-noise-diffusion-model",
341341
"path to the standalone high noise diffusion model",
342342
&high_noise_diffusion_model_path},
343+
{"",
344+
"--embeddings-connectors",
345+
"path to LTXAV embeddings connectors",
346+
&embeddings_connectors_path},
343347
{"",
344348
"--vae",
345349
"path to standalone vae model",
@@ -656,6 +660,7 @@ std::string SDContextParams::to_string() const {
656660
<< " llm_vision_path: \"" << llm_vision_path << "\",\n"
657661
<< " diffusion_model_path: \"" << diffusion_model_path << "\",\n"
658662
<< " high_noise_diffusion_model_path: \"" << high_noise_diffusion_model_path << "\",\n"
663+
<< " embeddings_connectors_path: \"" << embeddings_connectors_path << "\",\n"
659664
<< " vae_path: \"" << vae_path << "\",\n"
660665
<< " taesd_path: \"" << taesd_path << "\",\n"
661666
<< " esrgan_path: \"" << esrgan_path << "\",\n"
@@ -712,6 +717,7 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f
712717
llm_vision_path.c_str(),
713718
diffusion_model_path.c_str(),
714719
high_noise_diffusion_model_path.c_str(),
720+
embeddings_connectors_path.c_str(),
715721
vae_path.c_str(),
716722
taesd_path.c_str(),
717723
control_net_path.c_str(),
@@ -2180,6 +2186,7 @@ sd_vid_gen_params_t SDGenerationParams::to_sd_vid_gen_params_t() {
21802186
params.strength = strength;
21812187
params.seed = seed;
21822188
params.video_frames = video_frames;
2189+
params.fps = fps;
21832190
params.vace_strength = vace_strength;
21842191
params.vae_tiling_params = vae_tiling_params;
21852192
params.cache = cache_params;

examples/common/common.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ struct SDContextParams {
9292
std::string llm_vision_path;
9393
std::string diffusion_model_path;
9494
std::string high_noise_diffusion_model_path;
95+
std::string embeddings_connectors_path;
9596
std::string vae_path;
9697
std::string taesd_path;
9798
std::string esrgan_path;

include/stable-diffusion.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,7 @@ typedef struct {
171171
const char* llm_vision_path;
172172
const char* diffusion_model_path;
173173
const char* high_noise_diffusion_model_path;
174+
const char* embeddings_connectors_path;
174175
const char* vae_path;
175176
const char* taesd_path;
176177
const char* control_net_path;
@@ -359,6 +360,7 @@ typedef struct {
359360
float strength;
360361
int64_t seed;
361362
int video_frames;
363+
int fps;
362364
float vace_strength;
363365
sd_tiling_params_t vae_tiling_params;
364366
sd_cache_params_t cache;

src/common_dit.hpp

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,64 @@ namespace DiT {
103103
x = ggml_ext_slice(ctx, x, 0, 0, W); // [N, C, H, W]
104104
return x;
105105
}
106+
107+
// Cuts a video-shaped tensor into non-overlapping pt x ph x pw patches and
// flattens every patch into a single token.
//
// Shapes in the comments are listed outermost-first, i.e. in the reverse of
// ggml's ne[] order (W is ne[0], N*C is ne[3]).
//
//   x:      [N*C, T, H, W]
//   return: [N, t_len*h_len*w_len, C*pt*ph*pw]
//           where t_len = T/pt, h_len = H/ph, w_len = W/pw
//
// pt/ph/pw: patch extent along T/H/W; each must be positive and divide the
//           matching dimension exactly.
// N:        batch size folded into the outermost dimension of x (default 1).
//
// Any violated precondition trips a GGML_ASSERT.
inline ggml_tensor* patchify(ggml_context* ctx,
                             ggml_tensor* x,
                             int pt,
                             int ph,
                             int pw,
                             int64_t N = 1) {
    // Reject non-positive sizes up front: they are used as divisors below.
    GGML_ASSERT(pt > 0 && ph > 0 && pw > 0 && N > 0);

    const int64_t C     = x->ne[3] / N;
    const int64_t T     = x->ne[2];
    const int64_t H     = x->ne[1];
    const int64_t W     = x->ne[0];
    const int64_t t_len = T / pt;
    const int64_t h_len = H / ph;
    const int64_t w_len = W / pw;

    // The divisions above must be exact, otherwise the reshapes would mix data
    // across channels / patches.
    GGML_ASSERT(C * N == x->ne[3]);
    GGML_ASSERT(t_len * pt == T && h_len * ph == H && w_len * pw == W);

    // Each reshape peels one patch axis off its dimension and the following
    // permute swaps the two middle axes, so that after three rounds the
    // (pt, ph, pw) elements of a patch sit next to each other.
    x = ggml_reshape_4d(ctx, x, pw * w_len, ph * h_len, pt, t_len * C * N);      // [N*C*t_len, pt, h_len*ph, w_len*pw]
    x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3));          // [N*C*t_len, h_len*ph, pt, w_len*pw]
    x = ggml_reshape_4d(ctx, x, pw * w_len, pt, ph, h_len * t_len * C * N);      // [N*C*t_len*h_len, ph, pt, w_len*pw]
    x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3));          // [N*C*t_len*h_len, pt, ph, w_len*pw]
    x = ggml_reshape_4d(ctx, x, pw, w_len, ph * pt, h_len * t_len * C * N);      // [N*C*t_len*h_len, pt*ph, w_len, pw]
    x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3));          // [N*C*t_len*h_len, w_len, pt*ph, pw]

    // Move the channel axis behind the token axis and fold it into the
    // per-token feature dimension.
    x = ggml_reshape_4d(ctx, x, pw * ph * pt, w_len * h_len * t_len, C, N);      // [N, C, t_len*h_len*w_len, pt*ph*pw]
    x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3));          // [N, t_len*h_len*w_len, C, pt*ph*pw]
    x = ggml_reshape_4d(ctx, x, pw * ph * pt * C, w_len * h_len * t_len, N, 1);  // [N, t_len*h_len*w_len, C*pt*ph*pw]
    return x;
}
137+
138+
// Rebuilds a video-shaped tensor from a sequence of flattened patch tokens.
//
// Shapes in the comments are listed outermost-first, i.e. in the reverse of
// ggml's ne[] order.
//
//   x:      [N, t_len*h_len*w_len, pt*ph*pw*C]
//   return: [N*C, t_len*pt, h_len*ph, w_len*pw]
//
// t_len/h_len/w_len: number of patches along T/H/W.
// pt/ph/pw:          patch extent along T/H/W; each must be positive.
//
// The batch size is derived from the element count, so the batch may live in
// ne[2] (the documented 3-D layout above) or in ne[3].
//
// NOTE(review): the last dimension is split with C innermost
// ([.., pt*ph*pw, C]); patchify() emits [.., C, pt*ph*pw]. The two orders only
// coincide when C == 1 or pt*ph*pw == 1 - confirm this is intended for larger
// patch sizes.
//
// Any violated precondition trips a GGML_ASSERT.
inline ggml_tensor* unpatchify(ggml_context* ctx,
                               ggml_tensor* x,
                               int64_t t_len,
                               int64_t h_len,
                               int64_t w_len,
                               int pt,
                               int ph,
                               int pw) {
    const int64_t tokens = t_len * h_len * w_len;

    // Reject non-positive sizes up front: they are used as divisors below.
    GGML_ASSERT(pt > 0 && ph > 0 && pw > 0 && tokens > 0);

    const int64_t C = x->ne[0] / pt / ph / pw;
    GGML_ASSERT(C > 0 && C * pt * ph * pw == x->ne[0]);

    // Previously N was read from ne[3], which is 1 for the documented
    // [N, tokens, D] layout and made the first reshape fail for N > 1.
    const int64_t per_sample = x->ne[0] * tokens;
    const int64_t total      = x->ne[0] * x->ne[1] * x->ne[2] * x->ne[3];
    const int64_t N          = total / per_sample;
    GGML_ASSERT(N * per_sample == total);

    // Pull the channel axis out in front of the token axis ...
    x = ggml_reshape_4d(ctx, x, C, pw * ph * pt, w_len * h_len * t_len, N);  // [N, t_len*h_len*w_len, pt*ph*pw, C]
    x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 1, 2, 0, 3));      // [N, C, t_len*h_len*w_len, pt*ph*pw]

    // ... then merge each patch axis back into its dimension, innermost
    // (W) first, mirroring the steps of patchify in reverse.
    x = ggml_reshape_4d(ctx, x, pw, ph * pt, w_len, h_len * t_len * C * N);  // [N*C*t_len*h_len, w_len, pt*ph, pw]
    x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3));      // [N*C*t_len*h_len, pt*ph, w_len, pw]
    x = ggml_reshape_4d(ctx, x, pw * w_len, ph, pt, h_len * t_len * C * N);  // [N*C*t_len*h_len, pt, ph, w_len*pw]
    x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3));      // [N*C*t_len*h_len, ph, pt, w_len*pw]
    x = ggml_reshape_4d(ctx, x, pw * w_len, pt, ph * h_len, t_len * C * N);  // [N*C*t_len, h_len*ph, pt, w_len*pw]
    x = ggml_ext_cont(ctx, ggml_ext_torch_permute(ctx, x, 0, 2, 1, 3));      // [N*C*t_len, pt, h_len*ph, w_len*pw]
    x = ggml_reshape_4d(ctx, x, pw * w_len, ph * h_len, pt * t_len, C * N);  // [N*C, t_len*pt, h_len*ph, w_len*pw]
    return x;
}
106164
} // namespace DiT
107165

108166
#endif // __COMMON_DIT_HPP__

0 commit comments

Comments
 (0)