Commit 1e596c5

committed
update stable-diffusion.cpp to master-c648001 (+fixes)
1 parent 89feffc commit 1e596c5

31 files changed, +768,430 -1,764 lines changed

otherarch/sdcpp/avi_writer.h

Lines changed: 217 additions & 0 deletions
@@ -0,0 +1,217 @@

#ifndef __AVI_WRITER_H__
#define __AVI_WRITER_H__

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "stable-diffusion.h"

#ifndef INCLUDE_STB_IMAGE_WRITE_H
#include "stb_image_write.h"
#endif

typedef struct {
    uint32_t offset;
    uint32_t size;
} avi_index_entry;

// Write 32-bit little-endian integer
void write_u32_le(FILE* f, uint32_t val) {
    fwrite(&val, 4, 1, f);
}

// Write 16-bit little-endian integer
void write_u16_le(FILE* f, uint16_t val) {
    fwrite(&val, 2, 1, f);
}

/**
 * Create an MJPG AVI file from an array of sd_image_t images.
 * Images are encoded to JPEG using stb_image_write.
 *
 * @param filename   Output AVI file name.
 * @param images     Array of input images.
 * @param num_images Number of images in the array.
 * @param fps        Frames per second for the video.
 * @param quality    JPEG quality (0-100).
 * @return 0 on success, -1 on failure.
 */
int create_mjpg_avi_from_sd_images(const char* filename, sd_image_t* images, int num_images, int fps, int quality = 90) {
    if (num_images == 0) {
        fprintf(stderr, "Error: Image array is empty.\n");
        return -1;
    }

    FILE* f = fopen(filename, "wb");
    if (!f) {
        perror("Error opening file for writing");
        return -1;
    }

    uint32_t width    = images[0].width;
    uint32_t height   = images[0].height;
    uint32_t channels = images[0].channel;
    if (channels != 3 && channels != 4) {
        fprintf(stderr, "Error: Unsupported channel count: %u\n", channels);
        fclose(f);
        return -1;
    }

    // --- RIFF AVI Header ---
    fwrite("RIFF", 4, 1, f);
    long riff_size_pos = ftell(f);
    write_u32_le(f, 0);  // Placeholder for file size
    fwrite("AVI ", 4, 1, f);

    // 'hdrl' LIST (header list)
    fwrite("LIST", 4, 1, f);
    write_u32_le(f, 4 + 8 + 56 + 8 + 4 + 8 + 56 + 8 + 40);
    fwrite("hdrl", 4, 1, f);

    // 'avih' chunk (AVI main header)
    fwrite("avih", 4, 1, f);
    write_u32_le(f, 56);
    write_u32_le(f, 1000000 / fps);       // Microseconds per frame
    write_u32_le(f, 0);                   // Max bytes per second
    write_u32_le(f, 0);                   // Padding granularity
    write_u32_le(f, 0x110);               // Flags (HASINDEX | ISINTERLEAVED)
    write_u32_le(f, num_images);          // Total frames
    write_u32_le(f, 0);                   // Initial frames
    write_u32_le(f, 1);                   // Number of streams
    write_u32_le(f, width * height * 3);  // Suggested buffer size
    write_u32_le(f, width);
    write_u32_le(f, height);
    write_u32_le(f, 0);  // Reserved
    write_u32_le(f, 0);  // Reserved
    write_u32_le(f, 0);  // Reserved
    write_u32_le(f, 0);  // Reserved

    // 'strl' LIST (stream list)
    fwrite("LIST", 4, 1, f);
    write_u32_le(f, 4 + 8 + 56 + 8 + 40);
    fwrite("strl", 4, 1, f);

    // 'strh' chunk (stream header)
    fwrite("strh", 4, 1, f);
    write_u32_le(f, 56);
    fwrite("vids", 4, 1, f);              // Stream type: video
    fwrite("MJPG", 4, 1, f);              // Codec: Motion JPEG
    write_u32_le(f, 0);                   // Flags
    write_u16_le(f, 0);                   // Priority
    write_u16_le(f, 0);                   // Language
    write_u32_le(f, 0);                   // Initial frames
    write_u32_le(f, 1);                   // Scale
    write_u32_le(f, fps);                 // Rate
    write_u32_le(f, 0);                   // Start
    write_u32_le(f, num_images);          // Length
    write_u32_le(f, width * height * 3);  // Suggested buffer size
    write_u32_le(f, (uint32_t)-1);        // Quality
    write_u32_le(f, 0);                   // Sample size
    write_u16_le(f, 0);                   // rcFrame.left
    write_u16_le(f, 0);                   // rcFrame.top
    write_u16_le(f, 0);                   // rcFrame.right
    write_u16_le(f, 0);                   // rcFrame.bottom

    // 'strf' chunk (stream format: BITMAPINFOHEADER)
    fwrite("strf", 4, 1, f);
    write_u32_le(f, 40);
    write_u32_le(f, 40);                  // biSize
    write_u32_le(f, width);
    write_u32_le(f, height);
    write_u16_le(f, 1);                   // biPlanes
    write_u16_le(f, 24);                  // biBitCount
    fwrite("MJPG", 4, 1, f);              // biCompression (FOURCC)
    write_u32_le(f, width * height * 3);  // biSizeImage
    write_u32_le(f, 0);                   // XPelsPerMeter
    write_u32_le(f, 0);                   // YPelsPerMeter
    write_u32_le(f, 0);                   // Colors used
    write_u32_le(f, 0);                   // Colors important

    // 'movi' LIST (video frames)
    long movi_list_pos = ftell(f);
    fwrite("LIST", 4, 1, f);
    long movi_size_pos = ftell(f);
    write_u32_le(f, 0);  // Placeholder for movi size
    fwrite("movi", 4, 1, f);

    avi_index_entry* index = (avi_index_entry*)malloc(sizeof(avi_index_entry) * num_images);
    if (!index) {
        fclose(f);
        return -1;
    }

    // Encode and write each frame as JPEG
    struct {
        uint8_t* buf;
        size_t size;
    } jpeg_data;

    for (int i = 0; i < num_images; i++) {
        jpeg_data.buf  = NULL;
        jpeg_data.size = 0;

        // Callback function to collect JPEG data into memory
        auto write_to_buf = [](void* context, void* data, int size) {
            auto jd = (decltype(jpeg_data)*)context;
            jd->buf = (uint8_t*)realloc(jd->buf, jd->size + size);
            memcpy(jd->buf + jd->size, data, size);
            jd->size += size;
        };

        // Encode to JPEG in memory
        stbi_write_jpg_to_func(
            write_to_buf,
            &jpeg_data,
            images[i].width,
            images[i].height,
            channels,
            images[i].data,
            quality);

        // Write '00dc' chunk (video frame)
        fwrite("00dc", 4, 1, f);
        write_u32_le(f, jpeg_data.size);
        index[i].offset = ftell(f) - 8;
        index[i].size   = jpeg_data.size;
        fwrite(jpeg_data.buf, 1, jpeg_data.size, f);

        // Align to even byte size
        if (jpeg_data.size % 2)
            fputc(0, f);

        free(jpeg_data.buf);
    }

    // Finalize 'movi' size
    long cur_pos   = ftell(f);
    long movi_size = cur_pos - movi_size_pos - 4;
    fseek(f, movi_size_pos, SEEK_SET);
    write_u32_le(f, movi_size);
    fseek(f, cur_pos, SEEK_SET);

    // Write 'idx1' index
    fwrite("idx1", 4, 1, f);
    write_u32_le(f, num_images * 16);
    for (int i = 0; i < num_images; i++) {
        fwrite("00dc", 4, 1, f);
        write_u32_le(f, 0x10);
        write_u32_le(f, index[i].offset);
        write_u32_le(f, index[i].size);
    }

    // Finalize RIFF size
    cur_pos        = ftell(f);
    long file_size = cur_pos - riff_size_pos - 4;
    fseek(f, riff_size_pos, SEEK_SET);
    write_u32_le(f, file_size);
    fseek(f, cur_pos, SEEK_SET);

    fclose(f);
    free(index);

    return 0;
}

#endif  // __AVI_WRITER_H__
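
The header above exposes a single entry point. For orientation only, a minimal call-site sketch (not part of the commit) might look like the following; it assumes the caller already holds an array of generated sd_image_t frames of identical size, and the file name, frame count, and fps values are illustrative:

#include "avi_writer.h"

// Hypothetical caller (not from this commit): frames is assumed to hold
// num_frames RGB (channel == 3) or RGBA (channel == 4) images of equal size.
static bool save_frames_as_avi(sd_image_t* frames, int num_frames) {
    // 16 fps, default JPEG quality (90)
    if (create_mjpg_avi_from_sd_images("output.avi", frames, num_frames, 16) != 0) {
        fprintf(stderr, "failed to write output.avi\n");
        return false;
    }
    return true;
}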

otherarch/sdcpp/clip.hpp

Lines changed: 36 additions & 18 deletions
@@ -179,9 +179,9 @@ class CLIPTokenizer {
 
         auto it = encoder.find(utf8_to_utf32("img</w>"));
         if (it != encoder.end()) {
-            LOG_DEBUG(" trigger word img already in vocab");
+            LOG_DEBUG("trigger word img already in vocab");
         } else {
-            LOG_DEBUG(" trigger word img not in vocab yet");
+            LOG_DEBUG("trigger word img not in vocab yet");
         }
 
         int rank = 0;
@@ -488,14 +488,14 @@ struct CLIPLayer : public GGMLBlock {
         blocks["mlp"] = std::shared_ptr<GGMLBlock>(new CLIPMLP(d_model, intermediate_size));
     }
 
-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, bool mask = true) {
+    struct ggml_tensor* forward(struct ggml_context* ctx, ggml_backend_t backend, struct ggml_tensor* x, bool mask = true) {
         // x: [N, n_token, d_model]
         auto self_attn   = std::dynamic_pointer_cast<MultiheadAttention>(blocks["self_attn"]);
         auto layer_norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["layer_norm1"]);
         auto layer_norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["layer_norm2"]);
         auto mlp         = std::dynamic_pointer_cast<CLIPMLP>(blocks["mlp"]);
 
-        x = ggml_add(ctx, x, self_attn->forward(ctx, layer_norm1->forward(ctx, x), mask));
+        x = ggml_add(ctx, x, self_attn->forward(ctx, backend, layer_norm1->forward(ctx, x), mask));
         x = ggml_add(ctx, x, mlp->forward(ctx, layer_norm2->forward(ctx, x)));
         return x;
     }
@@ -517,7 +517,11 @@ struct CLIPEncoder : public GGMLBlock {
         }
     }
 
-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, int clip_skip = -1, bool mask = true) {
+    struct ggml_tensor* forward(struct ggml_context* ctx,
+                                ggml_backend_t backend,
+                                struct ggml_tensor* x,
+                                int clip_skip = -1,
+                                bool mask = true) {
         // x: [N, n_token, d_model]
         int layer_idx = n_layer - 1;
         // LOG_DEBUG("clip_skip %d", clip_skip);
@@ -532,7 +536,7 @@ struct CLIPEncoder : public GGMLBlock {
             }
             std::string name = "layers." + std::to_string(i);
             auto layer       = std::dynamic_pointer_cast<CLIPLayer>(blocks[name]);
-            x                = layer->forward(ctx, x, mask);  // [N, n_token, d_model]
+            x                = layer->forward(ctx, backend, x, mask);  // [N, n_token, d_model]
             // LOG_DEBUG("layer %d", i);
         }
         return x;
@@ -718,6 +722,7 @@ class CLIPTextModel : public GGMLBlock {
     }
 
     struct ggml_tensor* forward(struct ggml_context* ctx,
+                                ggml_backend_t backend,
                                 struct ggml_tensor* input_ids,
                                 struct ggml_tensor* tkn_embeddings,
                                 size_t max_token_idx = 0,
@@ -728,7 +733,7 @@ class CLIPTextModel : public GGMLBlock {
         auto final_layer_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["final_layer_norm"]);
 
         auto x = embeddings->forward(ctx, input_ids, tkn_embeddings);  // [N, n_token, hidden_size]
-        x      = encoder->forward(ctx, x, return_pooled ? -1 : clip_skip, true);
+        x      = encoder->forward(ctx, backend, x, return_pooled ? -1 : clip_skip, true);
         if (return_pooled || with_final_ln) {
             x = final_layer_norm->forward(ctx, x);
         }
@@ -739,7 +744,7 @@ class CLIPTextModel : public GGMLBlock {
             if (text_projection != NULL) {
                 pooled = ggml_nn_linear(ctx, pooled, text_projection, NULL);
             } else {
-                LOG_DEBUG("Missing text_projection matrix, assuming identity...");
+                LOG_DEBUG("identity projection");
             }
             return pooled;  // [hidden_size, 1, 1]
         }
@@ -780,7 +785,11 @@ class CLIPVisionModel : public GGMLBlock {
         blocks["post_layernorm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
     }
 
-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* pixel_values, bool return_pooled = true) {
+    struct ggml_tensor* forward(struct ggml_context* ctx,
+                                ggml_backend_t backend,
+                                struct ggml_tensor* pixel_values,
+                                bool return_pooled = true,
+                                int clip_skip = -1) {
         // pixel_values: [N, num_channels, image_size, image_size]
         auto embeddings    = std::dynamic_pointer_cast<CLIPVisionEmbeddings>(blocks["embeddings"]);
         auto pre_layernorm = std::dynamic_pointer_cast<LayerNorm>(blocks["pre_layernorm"]);
@@ -789,7 +798,7 @@ class CLIPVisionModel : public GGMLBlock {
 
         auto x = embeddings->forward(ctx, pixel_values);  // [N, num_positions, embed_dim]
         x      = pre_layernorm->forward(ctx, x);
-        x      = encoder->forward(ctx, x, -1, false);
+        x      = encoder->forward(ctx, backend, x, clip_skip, false);
         // print_ggml_tensor(x, true, "ClipVisionModel x: ");
         auto last_hidden_state = x;
         x                      = post_layernorm->forward(ctx, x);  // [N, n_token, hidden_size]
@@ -857,29 +866,37 @@ class CLIPVisionModelProjection : public GGMLBlock {
         blocks["visual_projection"] = std::shared_ptr<GGMLBlock>(new CLIPProjection(hidden_size, projection_dim, transpose_proj_w));
     }
 
-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* pixel_values) {
+    struct ggml_tensor* forward(struct ggml_context* ctx,
+                                ggml_backend_t backend,
+                                struct ggml_tensor* pixel_values,
+                                bool return_pooled = true,
+                                int clip_skip = -1) {
         // pixel_values: [N, num_channels, image_size, image_size]
-        // return: [N, projection_dim]
+        // return: [N, projection_dim] if return_pooled else [N, n_token, hidden_size]
         auto vision_model      = std::dynamic_pointer_cast<CLIPVisionModel>(blocks["vision_model"]);
        auto visual_projection = std::dynamic_pointer_cast<CLIPProjection>(blocks["visual_projection"]);
 
-        auto x = vision_model->forward(ctx, pixel_values);  // [N, hidden_size]
-        x      = visual_projection->forward(ctx, x);        // [N, projection_dim]
+        auto x = vision_model->forward(ctx, backend, pixel_values, return_pooled, clip_skip);  // [N, hidden_size] or [N, n_token, hidden_size]
 
-        return x;  // [N, projection_dim]
+        if (return_pooled) {
+            x = visual_projection->forward(ctx, x);  // [N, projection_dim]
+        }
+
+        return x;
     }
 };
 
 struct CLIPTextModelRunner : public GGMLRunner {
     CLIPTextModel model;
 
     CLIPTextModelRunner(ggml_backend_t backend,
+                        bool offload_params_to_cpu,
                         const String2GGMLType& tensor_types,
                         const std::string prefix,
                        CLIPVersion version = OPENAI_CLIP_VIT_L_14,
                        bool with_final_ln = true,
                        int clip_skip_value = -1)
-        : GGMLRunner(backend), model(version, with_final_ln, clip_skip_value) {
+        : GGMLRunner(backend, offload_params_to_cpu), model(version, with_final_ln, clip_skip_value) {
         model.init(params_ctx, tensor_types, prefix);
     }
 
@@ -896,6 +913,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
     }
 
     struct ggml_tensor* forward(struct ggml_context* ctx,
+                                ggml_backend_t backend,
                                 struct ggml_tensor* input_ids,
                                 struct ggml_tensor* embeddings,
                                 size_t max_token_idx = 0,
@@ -907,7 +925,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
             input_ids = ggml_reshape_2d(ctx, input_ids, model.n_token, input_ids->ne[0] / model.n_token);
         }
 
-        return model.forward(ctx, input_ids, embeddings, max_token_idx, return_pooled);
+        return model.forward(ctx, backend, input_ids, embeddings, max_token_idx, return_pooled);
     }
 
     struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids,
@@ -933,7 +951,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
             embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings, 1);
         }
 
-        struct ggml_tensor* hidden_states = forward(compute_ctx, input_ids, embeddings, max_token_idx, return_pooled);
+        struct ggml_tensor* hidden_states = forward(compute_ctx, runtime_backend, input_ids, embeddings, max_token_idx, return_pooled);
 
         ggml_build_forward_expand(gf, hidden_states);
 