#ifndef BITSANDBYTES_CPU_OPS_H
#define BITSANDBYTES_CPU_OPS_H

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <thread>
#include <type_traits>

#if defined(_OPENMP)
#include <omp.h>
#endif

// amx-bf16
#define TILE_M 16
#define TILE_N 16
#define TILE_K 32
// work around compiler internal error
#define BLOCK_K 128 // 4 * TILE_K

// block size for AMX gemm
constexpr int block_size_m() { return 2 * TILE_M; }

constexpr int block_size_n() { return 2 * TILE_N; }

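// i.e. block_size_m() == block_size_n() == 32: each macro block is a 2x2 grid
// of 16x16 AMX tiles, and BLOCK_K = 4 * TILE_K = 128 groups four K-tiles
// (split out as a constant to dodge the compiler issue noted above).
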
template <typename T> inline int get_cache_blocks(int chunk_size) {
    // L2 2MB and ratio of 50%
    const int L2_size = 2048 * 1024 >> 1;
    return std::max(1, int(L2_size / (chunk_size * sizeof(T))));
}

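// e.g. get_cache_blocks<uint16_t>(2048) = 1 MB / (2048 * 2 B) = 256, i.e. 256
// chunks of 2048 bf16 values fit in the reserved half of a 2 MB L2.
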
// forced unroll for perf critical path
#if __has_attribute(always_inline)
#define ALWAYS_INLINE __attribute__((__always_inline__)) inline
#else
#define ALWAYS_INLINE inline
#endif

template <int n> struct Unroll {
    template <typename Func, typename... Args> ALWAYS_INLINE void operator()(const Func& f, Args... args) const {
        Unroll<n - 1>{}(f, args...);
        f(std::integral_constant<int, n - 1>{}, args...);
    }
};

template <> struct Unroll<1> {
    template <typename Func, typename... Args> ALWAYS_INLINE void operator()(const Func& f, Args... args) const {
        f(std::integral_constant<int, 0>{}, args...);
    }
};

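// Usage sketch: Unroll<4>{} invokes the functor once per compile-time index,
// passing std::integral_constant<int, 0> ... <int, 3> as the first argument:
//
//   float acc[4] = {};
//   Unroll<4>{}([&](auto i) { acc[i] += 1.0f; });
//
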
template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type = 0> inline T div_up(T x, T y) {
    return (x + y - 1) / y;
}

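// e.g. div_up(10, 4) == 3: ceiling division for the integral block counts used below.
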
inline int get_max_threads() {
#if defined(_OPENMP)
    return omp_get_max_threads();
#else
    // fallback when OpenMP is unavailable
    return (int)std::thread::hardware_concurrency();
#endif
}

inline int adjust_num_threads(int m) {
    int actual_nth = get_max_threads();
    if (m == 1)
        return actual_nth;
    return std::max(1, (actual_nth >> 1) * 2);
}

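// e.g. 7 available threads and m > 1 yields (7 >> 1) * 2 = 6, keeping the
// thread count even so the 2D grid in parallel_2d below stays balanced.
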
template <typename func_t> inline void parallel_2d(int m, int n, const func_t& f) {
    // make sure we have even num_threads
    int nth = adjust_num_threads(m);

    // [NOTE] thread blocking:
    //
    //   1) prefer square block per thread
    //   2) use even number of CPU cores
    //   3) use all `num_threads` cores
    //
    // we have:
    //   TM * TN = T
    //   BM / TM = BN / TN
    // then:
    //   TM = ((BM / BN) * T) ^ 0.5
    //
    float r = float(m) / n;
    int nth_m = std::ceil(std::sqrt(r * nth));
    int nth_n = 1;
    for (; nth_m > 0; --nth_m) {
        nth_n = nth / nth_m;
        if (nth_m * nth_n == nth) {
            break;
        }
    }

#if defined(_OPENMP)
#pragma omp parallel num_threads(nth)
    {
        int ith = omp_get_thread_num();
        int ith_m = ith / nth_n;
        int ith_n = ith % nth_n;

        int thread_block_m = div_up(m, nth_m);
        int thread_block_n = div_up(n, nth_n);

        int begin_m = ith_m * thread_block_m;
        int end_m = std::min(m, begin_m + thread_block_m);
        int begin_n = ith_n * thread_block_n;
        int end_n = std::min(n, begin_n + thread_block_n);

        f(begin_m, end_m, begin_n, end_n);
    }
#else
    f(0, m, 0, n);
#endif
}
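
// Worked example of the blocking above: m = 512, n = 2048, nth = 8 gives
// r = 0.25, nth_m = ceil(sqrt(0.25 * 8)) = 2, nth_n = 4, so each of the
// 8 threads handles a 256 x 512 block (div_up(512, 2) x div_up(2048, 4)).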
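// Blockwise 8-bit quantization: for every `blocksize` elements of A, absmax
// stores the block's max absolute value and each element of out is the index
// of the closest entry in the 256-entry `code` table (a sketch of the
// contract; the implementation defines the exact rounding).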
void quantize_cpu(float* code, float* A, float* absmax, unsigned char* out, long long blocksize, long long n);

typedef enum DataType_t {
    General8bit = 0,
    FP4 = 1,
    NF4 = 2,
} DataType_t;

// half-precision value carried as its raw 16-bit bit pattern
typedef uint16_t fp16_t;

// ...

static inline fp16_t float_to_fp16(float x) {
    uint32_t bits;
    std::memcpy(&bits, &x, 4);
    uint32_t sign = (bits >> 31) & 0x1;
    uint32_t exp = (bits >> 23) & 0xFF;
    uint32_t mant = bits & 0x7FFFFF;

    uint16_t h;
    if (exp == 0xFF) {                      // Inf / NaN
        uint16_t mant16 = mant ? 0x200 : 0; // quiet NaN: set MSB of mantissa
        h = (sign << 15) | (0x1F << 10) | mant16;
    } else if (exp > 0x70 + 0x1E) {      // overflow: exp_f -127 +15 > 30 (exp_f > 142)
        h = (sign << 15) | (0x1F << 10); // Inf
    } else if (exp < 0x71) {             // subnormal or zero (exp_f < 113)
        if (exp < 0x67) {                // too small -> zero (exp_f < 103)
            h = (sign << 15);
        } else {
            // subnormal: implicit leading 1
            // ... (rest of float_to_fp16 elided)

inline float dDequantizeNF4(unsigned char val) {
    // ...
    return -1.0f; // *0000
}
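
// NF4 decodes a 4-bit code into one of 16 fixed values in [-1, 1] (quantiles
// tuned for normally distributed weights); code 0b0000 maps to -1.0f as above.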

template <typename T>
void dequantizeBlockwise8bitCpu(
    float* code, unsigned char* A, const float* absmax, T* out, long long blocksize, long long n
);

template <typename T, int DATA_TYPE>
void dequantizeBlockwise4bitCpu(
    unsigned char* A, const float* absmax, T* out, long long blocksize, long long m, long long n
);

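// Usage sketch (assumes the templates are instantiated for float and NF4 in
// the defining translation unit; buffer names are illustrative):
//
//   std::vector<float> out(m * n);
//   dequantizeBlockwise4bitCpu<float, NF4>(packed, absmax, out.data(), blocksize, m, n);
//
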
#if defined(__AVX512F__) && defined(__AVX512BF16__)
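// Reading of the signature (the kernel lives in the AVX512 implementation):
// w holds 4-bit quantized weights that are dequantized on the fly with the
// per-block absmax scales and accumulated against x; M/N/K are the GEMM-style
// dimensions and x_stride/out_stride the row strides of x and out.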
template <typename T, int DATA_TYPE>
void gemv_4bit_inference(
    int64_t M, int64_t N, int64_t K, const T* __restrict__ x, const unsigned char* __restrict__ w,
    const T* __restrict__ absmax, T* __restrict__ out, int64_t blocksize, int64_t x_stride, int64_t out_stride
);
#endif

#if defined(__AVX512F__)