Skip to content

Commit d232509

Browse files
committed
add qwen image model
1 parent d8d4c26 commit d232509

File tree

8 files changed

+725
-25
lines changed

8 files changed

+725
-25
lines changed

common.hpp

Lines changed: 31 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -177,7 +177,7 @@ class ResBlock : public GGMLBlock {
177177
}
178178
};
179179

180-
class GEGLU : public GGMLBlock {
180+
class GEGLU : public UnaryBlock {
181181
protected:
182182
int64_t dim_in;
183183
int64_t dim_out;
@@ -216,14 +216,41 @@ class GEGLU : public GGMLBlock {
216216
}
217217
};
218218

219+
class GELU : public UnaryBlock {
220+
public:
221+
GELU(int64_t dim_in, int64_t dim_out, bool bias = true) {
222+
blocks["proj"] = std::shared_ptr<GGMLBlock>(new Linear(dim_in, dim_out, bias));
223+
}
224+
225+
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
226+
// x: [ne3, ne2, ne1, dim_in]
227+
// return: [ne3, ne2, ne1, dim_out]
228+
auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]);
229+
230+
x = proj->forward(ctx, x);
231+
x = ggml_gelu_inplace(ctx, x);
232+
return x;
233+
}
234+
};
235+
219236
class FeedForward : public GGMLBlock {
220237
public:
238+
enum class Activation {
239+
GEGLU,
240+
GELU
241+
};
221242
FeedForward(int64_t dim,
222243
int64_t dim_out,
223-
int64_t mult = 4) {
244+
int64_t mult = 4,
245+
Activation activation = Activation::GEGLU) {
224246
int64_t inner_dim = dim * mult;
225247

226-
blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GEGLU(dim, inner_dim));
248+
if (activation == Activation::GELU) {
249+
blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GELU(dim, inner_dim));
250+
} else {
251+
blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GEGLU(dim, inner_dim));
252+
}
253+
227254
// net_1 is nn.Dropout(), skip for inference
228255
blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out));
229256
}
@@ -232,7 +259,7 @@ class FeedForward : public GGMLBlock {
232259
// x: [ne3, ne2, ne1, dim]
233260
// return: [ne3, ne2, ne1, dim_out]
234261

235-
auto net_0 = std::dynamic_pointer_cast<GEGLU>(blocks["net.0"]);
262+
auto net_0 = std::dynamic_pointer_cast<UnaryBlock>(blocks["net.0"]);
236263
auto net_2 = std::dynamic_pointer_cast<Linear>(blocks["net.2"]);
237264

238265
x = net_0->forward(ctx, x); // [ne3, ne2, ne1, inner_dim]

examples/cli/main.cpp

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -27,7 +27,7 @@
2727

2828
#include "avi_writer.h"
2929

30-
#include "qwen.hpp"
30+
#include "qwen_image.hpp"
3131

3232
#if defined(_WIN32)
3333
#define NOMINMAX
@@ -1142,7 +1142,7 @@ int main(int argc, const char* argv[]) {
11421142
SDParams params;
11431143
params.verbose = true;
11441144
sd_set_log_callback(sd_log_cb, (void*)&params);
1145-
Qwen::Qwen2_5_VLEmbedder::load_from_file_and_test(argv[1]);
1145+
Qwen::QwenImageRunner::load_from_file_and_test(argv[1]);
11461146
exit(1);
11471147
parse_args(argc, argv, params);
11481148
params.sample_params.guidance.slg.layers = params.skip_layers.data();

ggml_extend.hpp

Lines changed: 14 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -1353,25 +1353,28 @@ __STATIC_INLINE__ std::vector<float> arange(float start, float end, float step =
13531353
// Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
13541354
__STATIC_INLINE__ std::vector<float> timestep_embedding(std::vector<float> timesteps,
13551355
int dim,
1356-
int max_period = 10000) {
1356+
int max_period = 10000,
1357+
bool flip_sin_to_cos = true,
1358+
float scale = 1.f) {
13571359
// timesteps: [N,]
13581360
// embedding: [N, dim]
1359-
size_t N = timesteps.size();
1360-
int acutual_dim = dim;
1361-
if (dim % 2 != 0) {
1362-
acutual_dim = dim + 1;
1363-
}
1364-
std::vector<float> embedding(N * acutual_dim, 0.f);
1361+
size_t N = timesteps.size();
1362+
std::vector<float> embedding(N * dim, 0.f);
13651363
int half = dim / 2;
13661364
std::vector<float> freqs(half);
13671365
for (int i = 0; i < half; ++i) {
13681366
freqs[i] = (float)std::exp(-std::log(max_period) * i / half);
13691367
}
13701368
for (int i = 0; i < N; ++i) {
13711369
for (int j = 0; j < half; ++j) {
1372-
float arg = timesteps[i] * freqs[j];
1373-
embedding[i * acutual_dim + j] = std::cos(arg);
1374-
embedding[i * acutual_dim + j + half] = std::sin(arg);
1370+
float arg = timesteps[i] * freqs[j] * scale;
1371+
if (flip_sin_to_cos) {
1372+
embedding[i * dim + j] = std::cos(arg);
1373+
embedding[i * dim + j + half] = std::sin(arg);
1374+
} else {
1375+
embedding[i * dim + j] = std::sin(arg);
1376+
embedding[i * dim + j + half] = std::cos(arg);
1377+
}
13751378
}
13761379
}
13771380
return embedding;
@@ -1392,11 +1395,7 @@ __STATIC_INLINE__ struct ggml_tensor* new_timestep_embedding(struct ggml_context
13921395
// timesteps: [N,]
13931396
// embedding: [N, dim]
13941397
std::vector<float> embedding_vec = timestep_embedding(timesteps, dim, max_period);
1395-
int acutual_dim = dim;
1396-
if (dim % 2 != 0) {
1397-
acutual_dim = dim + 1;
1398-
}
1399-
struct ggml_tensor* embedding = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, acutual_dim, timesteps.size());
1398+
struct ggml_tensor* embedding = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, dim, timesteps.size());
14001399
if (embedding->data != NULL) {
14011400
memcpy(((char*)embedding->data), ((char*)embedding_vec.data()), ggml_nbytes(embedding));
14021401
} else {

model.cpp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -728,6 +728,7 @@ void preprocess_tensor(TensorStorage tensor_storage,
728728

729729
// convert unet transformer linear to conv2d 1x1
730730
if (starts_with(new_name, "model.diffusion_model.") &&
731+
!starts_with(new_name, "model.diffusion_model.proj_out.") &&
731732
(ends_with(new_name, "proj_in.weight") || ends_with(new_name, "proj_out.weight"))) {
732733
tensor_storage.unsqueeze();
733734
}

0 commit comments

Comments (0)