Skip to content

Commit d9b625e

Browse files
committed
ggml-quants : handle imatrix for MXFP4
1 parent be48528 commit d9b625e

File tree

3 files changed

+483
-6
lines changed

3 files changed

+483
-6
lines changed

ggml/src/ggml-impl.h

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -468,9 +468,22 @@ static inline float ggml_e8m0_to_fp32_half(uint8_t x) {
468468
return result;
469469
}
470470

471+
/**
 * Converts a float32 to an E8M0 scale (an 8-bit biased power-of-two exponent).
 *
 * The sign of x is ignored. The result is the biased float32 exponent of x,
 * incremented by one when the most significant mantissa bit is set, i.e. the
 * magnitude is rounded to the nearest power of two with the half-way point
 * (1.5 * 2^e) rounding away from zero.
 *
 * Edge cases:
 *  - Inf and NaN map to 0xFF, the code E8M0 reserves for NaN.
 *  - Finite values that would round up past the largest finite scale
 *    saturate at 0xFE instead of turning into 0xFF.
 *
 * @param x  value whose magnitude is encoded
 * @return   the E8M0 code (biased exponent) for |x|
 */
static inline uint8_t ggml_fp32_to_e8m0(float x) {
    uint32_t bits;

    // type-pun through memcpy to avoid strict-aliasing problems
    memcpy(&bits, &x, sizeof(float));

    // biased exponent with the sign bit masked off
    const uint32_t exponent = (bits >> 23) & 0xFFu;

    if (exponent == 0xFFu) {
        // Inf/NaN: never let the rounding carry wrap this around to 0x00
        return 0xFF;
    }

    // round half-way away from zero: bit 22 set means mantissa >= 1.5
    uint32_t rounded = exponent + ((bits >> 22) & 1u);

    if (rounded > 0xFEu) {
        // keep finite inputs out of the reserved NaN code
        rounded = 0xFEu;
    }

    return (uint8_t) rounded;
}
481+
471482
// Macro alias that forwards its argument to ggml_e8m0_to_fp32().
#define GGML_E8M0_TO_FP32(x) ggml_e8m0_to_fp32(x)
472483
// Macro alias that forwards its argument to ggml_e8m0_to_fp32_half().
#define GGML_E8M0_TO_FP32_HALF(x) ggml_e8m0_to_fp32_half(x)
473484

485+
// Macro alias that forwards its argument to ggml_fp32_to_e8m0() (float32 -> E8M0 code).
#define GGML_FP32_TO_E8M0(x) ggml_fp32_to_e8m0(x)
486+
474487
/**
475488
* Converts brain16 to float32.
476489
*

0 commit comments

Comments
 (0)