Skip to content

Commit 4955d13

Browse files
Apply clang-format rules (#1678)
1 parent 61db085 commit 4955d13

File tree

11 files changed

+3777
-3496
lines changed

11 files changed

+3777
-3496
lines changed

csrc/common.cpp

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -26,10 +26,12 @@ void quantize_block(const quantize_block_args& args) {
2626
if (idx < 255) {
2727
float dist_left = fabs(normed_value - (args.code[idx]));
2828
float dist_right = fabs(normed_value - (args.code[idx + 1]));
29-
if (dist_right < dist_left) { idx += 1; }
29+
if (dist_right < dist_left) {
30+
idx += 1;
31+
}
3032
}
3133

3234
// 5. store index
33-
args.out[i] = (unsigned char) idx;
35+
args.out[i] = (unsigned char)idx;
3436
}
3537
}

csrc/common.cuh

Lines changed: 28 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -2,47 +2,48 @@
22

33
// TODO: Let's make some of these constexpr and put in a namespace.
44

5-
#define BNB_CC_MAXWELL 500
6-
#define BNB_CC_MAXWELL2 520
7-
#define BNB_CC_MAXWELL2_X1 530
8-
#define BNB_CC_PASCAL 600
9-
#define BNB_CC_PASCAL_X2 620
10-
#define BNB_CC_VOLTA 700
11-
#define BNB_CC_VOLTA_XAVIER 720
12-
#define BNB_CC_TURING 750
13-
#define BNB_CC_AMPERE 800
14-
#define BNB_CC_AMPERE2 860
15-
#define BNB_CC_AMPERE2_ORIN 870
16-
#define BNB_CC_ADA 890
17-
#define BNB_CC_HOPPER 900
18-
#define BNB_CC_BLACKWELL 1000
5+
#define BNB_CC_MAXWELL 500
6+
#define BNB_CC_MAXWELL2 520
7+
#define BNB_CC_MAXWELL2_X1 530
8+
#define BNB_CC_PASCAL 600
9+
#define BNB_CC_PASCAL_X2 620
10+
#define BNB_CC_VOLTA 700
11+
#define BNB_CC_VOLTA_XAVIER 720
12+
#define BNB_CC_TURING 750
13+
#define BNB_CC_AMPERE 800
14+
#define BNB_CC_AMPERE2 860
15+
#define BNB_CC_AMPERE2_ORIN 870
16+
#define BNB_CC_ADA 890
17+
#define BNB_CC_HOPPER 900
18+
#define BNB_CC_BLACKWELL 1000
1919

20-
#define BNB_FP16_AVAILABLE (__CUDA_ARCH__ >= BNB_CC_MAXWELL2_X1)
21-
#define BNB_FP16_MMA_AVAILABLE (__CUDA_ARCH__ >= BNB_CC_VOLTA)
22-
#define BNB_INT8_MMA_AVAILABLE (__CUDA_ARCH__ >= BNB_CC_VOLTA_XAVIER)
23-
#define BNB_BF16_AVAILABLE (__CUDA_ARCH__ >= BNB_CC_AMPERE)
24-
#define BNB_FP8_AVAILABLE (__CUDA_ARCH__ >= BNB_CC_ADA)
20+
#define BNB_FP16_AVAILABLE (__CUDA_ARCH__ >= BNB_CC_MAXWELL2_X1)
21+
#define BNB_FP16_MMA_AVAILABLE (__CUDA_ARCH__ >= BNB_CC_VOLTA)
22+
#define BNB_INT8_MMA_AVAILABLE (__CUDA_ARCH__ >= BNB_CC_VOLTA_XAVIER)
23+
#define BNB_BF16_AVAILABLE (__CUDA_ARCH__ >= BNB_CC_AMPERE)
24+
#define BNB_FP8_AVAILABLE (__CUDA_ARCH__ >= BNB_CC_ADA)
2525

26-
#define BNB_WARP_SIZE 32
26+
#define BNB_WARP_SIZE 32
2727

2828
// The maximum number of resident threads per SM varies by arch.
2929
// For A100/H100 and all prior to Turing, it is 2048, which allows
3030
// for 2 full blocks of 1024 threads per SM.
31-
// Reference: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications-technical-specifications-per-compute-capability
31+
// Reference:
32+
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications-technical-specifications-per-compute-capability
3233
#if __CUDA_ARCH__ == 750
33-
#define BNB_MAX_THREADS_PER_SM 1024
34+
#define BNB_MAX_THREADS_PER_SM 1024
3435
#elif __CUDA_ARCH__ >= 860 && __CUDA_ARCH__ <= 890
35-
#define BNB_MAX_THREADS_PER_SM 1536
36+
#define BNB_MAX_THREADS_PER_SM 1536
3637
#else
37-
#define BNB_MAX_THREADS_PER_SM 2048
38+
#define BNB_MAX_THREADS_PER_SM 2048
3839
#endif
3940

4041
// Maximum resident warps per SM is always directly related to the number of threads.
41-
#define BNB_MAX_WARPS_PER_SM ((BNB_MAX_THREADS_PER_SM) / (BNB_WARP_SIZE))
42+
#define BNB_MAX_WARPS_PER_SM ((BNB_MAX_THREADS_PER_SM) / (BNB_WARP_SIZE))
4243

4344
// Maximum resident blocks per SM may vary.
4445
#if __CUDA_ARCH__ == 860 || __CUDA_ARCH__ == 870
45-
#define BNB_MAX_BLOCKS_PER_SM 16
46+
#define BNB_MAX_BLOCKS_PER_SM 16
4647
#else
47-
#define BNB_MAX_BLOCKS_PER_SM ((BNB_MAX_WARPS_PER_SM) / 2)
48+
#define BNB_MAX_BLOCKS_PER_SM ((BNB_MAX_WARPS_PER_SM) / 2)
4849
#endif

csrc/common.h

Lines changed: 6 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -5,21 +5,18 @@
55

66
using namespace BinSearch;
77

8-
#define BLOCK_SIZE 16384
9-
108
struct quantize_block_args {
11-
BinAlgo<Scalar, float, Direct2> *bin_searcher;
12-
float *code;
13-
float *A;
14-
float *absmax;
15-
unsigned char *out;
9+
BinAlgo<Scalar, float, Direct2>* bin_searcher;
10+
float* code;
11+
float* A;
12+
float* absmax;
13+
unsigned char* out;
1614
long long block_end;
1715
long long block_idx;
1816
long long threadidx;
19-
long long blocksize;
17+
long long blocksize;
2018
};
2119

22-
2320
void quantize_block(const quantize_block_args& args);
2421

2522
#endif

csrc/cpu_ops.cpp

Lines changed: 32 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@
44

55
using namespace BinSearch;
66

7-
void dequantize_cpu(float *code, unsigned char *A, float *absmax, float *out, long long blocksize, long long n) {
7+
void dequantize_cpu(float* code, unsigned char* A, float* absmax, float* out, long long blocksize, long long n) {
88
for (long long block_idx = 0; block_idx < n; block_idx += blocksize) {
99
long long valid_items = n - block_idx >= blocksize ? blocksize : n - block_idx;
1010
long long block_end = block_idx + valid_items;
@@ -13,8 +13,7 @@ void dequantize_cpu(float *code, unsigned char *A, float *absmax, float *out, lo
1313
}
1414
}
1515

16-
void quantize_cpu(float *code, float *A, float *absmax, unsigned char *out, long long blocksize, long long n)
17-
{
16+
void quantize_cpu(float* code, float* A, float* absmax, unsigned char* out, long long blocksize, long long n) {
1817

1918
// the default code has range [-0.993, 1.0] which can cause an error in the binary search algorithm used below
2019
code[0] = -1.0f;
@@ -28,36 +27,35 @@ void quantize_cpu(float *code, float *A, float *absmax, unsigned char *out, long
2827
int thread_wave_size = 256;
2928
// we chunk the threads into waves of 256 since the max limit is
3029
// between 16k and 64k on Linux (we reach this when running BLOOM-176B with a large batch size)
31-
for(long long offset = 0; offset < num_blocks; offset+=thread_wave_size)
32-
{
33-
long long valid_chunks = num_blocks - offset >= thread_wave_size ? thread_wave_size : num_blocks - offset;
34-
std::vector<std::thread> threads(valid_chunks);
35-
std::vector<quantize_block_args> args(valid_chunks);
36-
37-
int chunks_processed = 0;
38-
for(long long block_idx = offset*blocksize; block_idx < n; block_idx += blocksize)
39-
{
40-
long long valid_items = n - block_idx >= blocksize ? blocksize : n - block_idx;
41-
long long block_end = block_idx + valid_items;
42-
43-
struct quantize_block_args& arg = args[chunks_processed];
44-
arg.bin_searcher = &bin_searcher;
45-
arg.code = code;
46-
arg.A = A;
47-
arg.absmax = absmax;
48-
arg.out = out;
49-
arg.block_end = block_end;
50-
arg.block_idx = block_idx;
51-
arg.threadidx = block_idx / blocksize;
52-
arg.blocksize = blocksize;
53-
54-
threads[chunks_processed] = std::thread([arg] { quantize_block(arg); });
55-
chunks_processed += 1;
56-
if(chunks_processed == valid_chunks){ break; }
57-
}
58-
59-
for (int i = 0; i < valid_chunks; i++)
60-
threads[i].join();
30+
for (long long offset = 0; offset < num_blocks; offset += thread_wave_size) {
31+
long long valid_chunks = num_blocks - offset >= thread_wave_size ? thread_wave_size : num_blocks - offset;
32+
std::vector<std::thread> threads(valid_chunks);
33+
std::vector<quantize_block_args> args(valid_chunks);
34+
35+
int chunks_processed = 0;
36+
for (long long block_idx = offset * blocksize; block_idx < n; block_idx += blocksize) {
37+
long long valid_items = n - block_idx >= blocksize ? blocksize : n - block_idx;
38+
long long block_end = block_idx + valid_items;
39+
40+
struct quantize_block_args& arg = args[chunks_processed];
41+
arg.bin_searcher = &bin_searcher;
42+
arg.code = code;
43+
arg.A = A;
44+
arg.absmax = absmax;
45+
arg.out = out;
46+
arg.block_end = block_end;
47+
arg.block_idx = block_idx;
48+
arg.threadidx = block_idx / blocksize;
49+
arg.blocksize = blocksize;
50+
51+
threads[chunks_processed] = std::thread([arg] { quantize_block(arg); });
52+
chunks_processed += 1;
53+
if (chunks_processed == valid_chunks) {
54+
break;
55+
}
56+
}
57+
58+
for (int i = 0; i < valid_chunks; i++)
59+
threads[i].join();
6160
}
62-
6361
}

csrc/cpu_ops.h

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@
44
#include <iostream>
55
#include <stdio.h>
66

7-
void quantize_cpu(float *code, float *A, float *absmax, unsigned char *out, long long blocksize, long long n);
8-
void dequantize_cpu(float *code, unsigned char *A, float *absmax, float *out, long long blocksize, long long n);
7+
void quantize_cpu(float* code, float* A, float* absmax, unsigned char* out, long long blocksize, long long n);
8+
void dequantize_cpu(float* code, unsigned char* A, float* absmax, float* out, long long blocksize, long long n);
99

1010
#endif

0 commit comments

Comments
 (0)