12 changes: 9 additions & 3 deletions common.hpp
@@ -245,15 +245,15 @@ class FeedForward : public GGMLBlock {
Activation activation = Activation::GEGLU,
bool force_prec_f32 = false) {
int64_t inner_dim = dim * mult;

+        SD_UNUSED(force_prec_f32);
if (activation == Activation::GELU) {
blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GELU(dim, inner_dim));
} else {
blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GEGLU(dim, inner_dim));
}

// net_1 is nn.Dropout(), skip for inference
blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out, true, false, force_prec_f32));
blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out));
}

struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
@@ -264,7 +264,13 @@ class FeedForward : public GGMLBlock {
auto net_2 = std::dynamic_pointer_cast<Linear>(blocks["net.2"]);

x = net_0->forward(ctx, x); // [ne3, ne2, ne1, inner_dim]
-        x = net_2->forward(ctx, x);  // [ne3, ne2, ne1, dim_out]
+        // The purpose of the scale here is to prevent NaN issues in certain situations,
+        // e.g. when using Vulkan without enabling force_prec_f32,
+        // or when using CUDA with k-quant weights.
+        float scale = 1.f / 128.f;
+        x = ggml_scale(ctx, x, scale);
+        x = net_2->forward(ctx, x);  // [ne3, ne2, ne1, dim_out]
+        x = ggml_scale(ctx, x, 1.f / scale);
Contributor:
Curious which part of the CUDA backend causes the issue here? I assume you are working around some FP overflow?

Owner Author:
It’s likely that ggml_mul_mat has a precision issue when the weights are k-quants.

Contributor:
I wonder why Jeff's ggml_mul_mat_set_prec fix worked for Vulkan but not CUDA; could CUDA be ignoring it?

Reply:
The CUDA approach to matmul is pretty different (see #851 (comment)). Anecdotally it seems to be less prone to precision issues, but I guess it can still run into problems.

return x;
}
};
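For context, here is the workaround from the diff pulled out as a standalone sketch, alongside the ggml_mul_mat_set_prec alternative discussed in the thread. The function names are illustrative and not part of this PR; the sketch assumes a current ggml where ggml_scale takes a float, matching the diff above.

```cpp
#include "ggml.h"

// Pre-scale activations so the FP16 accumulation inside ggml_mul_mat stays
// well below FP16_MAX (~65504), then undo the scale after the matmul.
static struct ggml_tensor* matmul_with_headroom(struct ggml_context* ctx,
                                                struct ggml_tensor* weight,  // possibly k-quant
                                                struct ggml_tensor* x) {
    const float scale = 1.f / 128.f;
    x = ggml_scale(ctx, x, scale);        // shrink activations
    x = ggml_mul_mat(ctx, weight, x);     // quantized matmul
    x = ggml_scale(ctx, x, 1.f / scale);  // restore the original magnitude
    return x;
}

// The alternative mentioned in the thread: request F32 accumulation on the
// matmul node itself. Per the discussion, this helped on Vulkan but not on
// the CUDA k-quant path.
static struct ggml_tensor* matmul_prec_f32(struct ggml_context* ctx,
                                           struct ggml_tensor* weight,
                                           struct ggml_tensor* x) {
    struct ggml_tensor* y = ggml_mul_mat(ctx, weight, x);
    ggml_mul_mat_set_prec(y, GGML_PREC_F32);
    return y;
}
```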
4 changes: 4 additions & 0 deletions ggml_extend.hpp
@@ -56,6 +56,10 @@
#define __STATIC_INLINE__ static inline
#endif

+#ifndef SD_UNUSED
+#define SD_UNUSED(x) (void)(x)
+#endif
+
__STATIC_INLINE__ void ggml_log_callback_default(ggml_log_level level, const char* text, void*) {
switch (level) {
case GGML_LOG_LEVEL_DEBUG:
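A quick usage sketch of the new macro (the function name is hypothetical): the void cast marks a deliberately unused parameter, silencing -Wunused-parameter on GCC/Clang and C4100 on MSVC while keeping the parameter in the signature for API compatibility, as done with force_prec_f32 in FeedForward above.

```cpp
#ifndef SD_UNUSED
#define SD_UNUSED(x) (void)(x)
#endif

// Hypothetical example: keep the parameter for callers, ignore it in this build.
static void make_feed_forward(bool force_prec_f32) {
    SD_UNUSED(force_prec_f32);  // no-op cast; suppresses unused-parameter warnings
}
```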