
Commit 63f538a

Remove deprecated code (#1798)
Parent: a2cb49b

File tree

13 files changed: 0 additions, 467 deletions

bitsandbytes/autograd/__init__.py

Lines changed: 0 additions & 1 deletion
@@ -1 +0,0 @@
-from ._functions import get_inverse_transform_indices, undo_layout

bitsandbytes/autograd/_functions.py

Lines changed: 0 additions & 59 deletions
@@ -1,12 +1,10 @@
-from collections.abc import Callable
 from dataclasses import dataclass
 from math import prod
 from typing import Optional
 import warnings
 from warnings import warn

 import torch
-from typing_extensions import deprecated

 import bitsandbytes.functional as F

@@ -50,66 +48,9 @@ def get_current_outlier_idx(self):
         return torch.Tensor(list(self.outliers)).to(torch.int64)


-@deprecated(
-    "This function is deprecated and will be removed in a future release.",
-    category=FutureWarning,
-)
-def get_inverse_transform_indices(
-    transform_tile: Callable[[torch.Tensor], torch.Tensor],
-    tile_size: tuple[int, int],
-):
-    """
-    Compute a permutation of indices that invert the specified (tiled) matrix transformation
-
-    :param transform_tile: a function that applies forward transform to a tensor of shape [dim1, dim2]
-    :param tile_size: higher-level tile dimensions, i.e. (8, 32) for Turing and (32, 32) for Ampere
-    :note: we assume that tile_transform applies to a cpu-based int8 tensor of shape tile_size
-    :example: transform_tile function for the turing layout (bitsandbytes.functional as F)
-    :returns: indices
-    """
-    d1, d2 = tile_size
-    assert 0 < d1 * d2 < 2**64
-    tile_indices = torch.arange(d1 * d2, dtype=torch.int64).view(d1, d2)
-    # encode each position in tile as a tuple of <= 8 unique bytes
-    permuted_tile_indices = torch.zeros_like(tile_indices)
-    for i in range(8):
-        # select i-th byte, apply transformation and trace where each index ended up
-        ith_dim_indices = torch.div(tile_indices, 256**i, rounding_mode="trunc") % 256
-        sample_tile_i = (ith_dim_indices - 128).to(torch.int8).contiguous()
-        assert torch.all(sample_tile_i.int() + 128 == ith_dim_indices), "int overflow"
-        permuted_tile_i = transform_tile(sample_tile_i)
-        ith_permuted_indices = permuted_tile_i.to(tile_indices.dtype) + 128
-        permuted_tile_indices += ith_permuted_indices * (256**i)
-        if d1 * d2 < 256**i:
-            break  # if all indices fit in i bytes, stop early
-    return permuted_tile_indices
-
-
 _is_compiling = torch.compiler.is_compiling


-@deprecated(
-    "This function is deprecated and will be removed in a future release.",
-    category=FutureWarning,
-)
-def undo_layout(permuted_tensor: torch.Tensor, tile_indices: torch.LongTensor) -> torch.Tensor:
-    """
-    Undo a tiled permutation such as turing or ampere layout
-
-    :param permuted_tensor: torch tensor in a permuted layout
-    :param tile_indices: reverse transformation indices, from get_inverse_transform_indices
-    :return: contiguous row-major tensor
-    """
-    (rows, cols), (tile_rows, tile_cols) = permuted_tensor.shape, tile_indices.shape
-    assert rows % tile_rows == cols % tile_cols == 0, "tensor must contain a whole number of tiles"
-    tensor = permuted_tensor.reshape(-1, tile_indices.numel()).t()
-    outputs = torch.empty_like(tensor)  # note: not using .index_copy because it was slower on cuda
-    outputs[tile_indices.flatten()] = tensor
-    outputs = outputs.reshape(tile_rows, tile_cols, cols // tile_cols, rows // tile_rows)
-    outputs = outputs.permute(3, 0, 2, 1)  # (rows // tile_rows, tile_rows), (cols // tile_cols, tile_cols)
-    return outputs.reshape(rows, cols).contiguous()
-
-
 @dataclass
 class MatmulLtState:
     _tile_indices: Optional[torch.Tensor] = None  # TODO: remove
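
Note: the two helpers removed above worked as a pair. get_inverse_transform_indices recorded, for each slot of a transformed tile, the flat index of the element that lands there, and undo_layout used those indices to scatter a permuted tensor back into row-major order. The following is a minimal toy sketch of that idea in plain PyTorch, not the removed API: the (8, 32) tile shape and the transpose standing in for a real layout transform are assumptions made purely for illustration.

    import torch

    tile_rows, tile_cols = 8, 32  # toy tile shape; the removed docstring mentions (8, 32) for Turing

    def toy_transform_tile(tile: torch.Tensor) -> torch.Tensor:
        # Stand-in for a hardware layout transform: simply transpose the tile.
        return tile.t().contiguous()

    # Trace where each flat index of a tile ends up under the transform.
    flat_ids = torch.arange(tile_rows * tile_cols).view(tile_rows, tile_cols)
    permuted_ids = toy_transform_tile(flat_ids).flatten()  # permuted_ids[i] = original index now at slot i

    # Undoing the layout is a scatter back to the original slots, the same
    # pattern undo_layout applied with its tile_indices argument.
    data = torch.randn(tile_rows, tile_cols)
    permuted = toy_transform_tile(data).flatten()
    restored = torch.empty_like(permuted)
    restored[permuted_ids] = permuted
    assert torch.equal(restored.view(tile_rows, tile_cols), data)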

bitsandbytes/functional.py

Lines changed: 0 additions & 96 deletions
@@ -1795,102 +1795,6 @@ def int8_mm_dequant(
     return result


-@deprecated("This function is deprecated and will be removed in a future release.", category=FutureWarning)
-def get_colrow_absmax(
-    A: torch.Tensor,
-    row_stats: Optional[torch.Tensor] = None,
-    col_stats: Optional[torch.Tensor] = None,
-    nnz_block_ptr: Optional[torch.Tensor] = None,
-    threshold=0.0,
-) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
-    """ "Determine the quantization statistics for input matrix `A` in accordance to the `LLM.int8()` algorithm.
-
-    The row-wise and column-wise absmax values are determined.
-
-    For more information, see the [LLM.int8() paper](https://arxiv.org/abs/2208.07339).
-
-    <Tip>
-    This function is useful for training, but for inference it is advised to use [`get_row_absmax`] instead.
-    The column-wise quantization scales are not typically needed in inference scenarios.
-    </Tip>
-
-    Args:
-        A (`torch.Tensor` with dtype `torch.float16`): Input tensor.
-        row_stats (`torch.Tensor`, *optional*): If provided, calculation of row statistics is skipped.
-        col_stats (`torch.Tensor`, *optional*): If provided, calculation of column statistics is skipped.
-        nnz_block_ptr (`torch.Tensor`, *optional*): Not used.
-        threshold (`float`, *optional*):
-            An optional threshold for sparse decomposition of outlier features.
-            No outliers are held back when 0.0. Defaults to 0.0.
-
-    Returns:
-        `Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]`: A tuple containing quantization statistics.
-        - `torch.Tensor` with dtype `torch.float32`: The row-wise quantization statistics.
-        - `torch.Tensor` with dtype `torch.float32`: The column-wise quantization statistics.
-        - `torch.Tensor` with dtype `torch.bool`, *optional*: A mask indicating the locations of outliers in the input tensor.
-    """
-    assert A.is_floating_point()
-
-    outlier_mask = None
-
-    if row_stats is None or col_stats is None:
-        absA = A.abs().view(-1, A.shape[-1])
-
-        if threshold > 0.0:
-            # Filter outliers from stats when enabled
-            outlier_mask = absA >= threshold
-            absA.masked_fill_(outlier_mask, 0.0)
-
-        if row_stats is None:
-            # shape [rows]; unsqueeze(-1) gives [rows,1]
-            # We have a CUDA kernel for row max, but not yet for cols.
-            row_stats = get_row_absmax(A, threshold)
-
-        if col_stats is None:
-            # shape [cols]; unsqueeze(0) gives [1,cols]
-            col_stats = absA.amax(dim=0, keepdim=False).float()
-
-    return row_stats, col_stats, outlier_mask
-
-
-@deprecated("This function is deprecated and will be removed in a future release.", category=FutureWarning)
-def get_row_absmax(A: torch.Tensor, threshold=0.0):
-    """Determine the quantization statistics for input matrix `A` in accordance to the `LLM.int8()` algorithm.
-
-    For more information, see the [LLM.int8() paper](https://arxiv.org/abs/2208.07339).
-
-    Args:
-        A (`torch.Tensor` with dtype `torch.float16`): The input matrix.
-        threshold (`float`, *optional*):
-            An optional threshold for sparse decomposition of outlier features.
-            No outliers are held back when 0.0. Defaults to 0.0.
-
-    Returns:
-        `torch.Tensor` with dtype `torch.float32`: The absolute maximum value for each row, with outliers ignored.
-    """
-
-    assert A.dtype == torch.float16
-
-    rows = prod(A.shape[:-1])
-    cols = A.shape[-1]
-
-    row_stats = torch.empty((rows,), dtype=torch.float32, device=A.device)
-
-    is_on_gpu([A])
-
-    with _cuda_device_of(A):
-        lib.cget_row_stats(
-            get_ptr(A),
-            get_ptr(row_stats),
-            ct.c_float(threshold),
-            ct.c_int32(rows),
-            ct.c_int32(cols),
-            _get_tensor_stream(A),
-        )
-
-    return row_stats
-
-
 class COOSparseTensor:
     def __init__(
         self, rows: int, cols: int, nnz: int, rowidx: torch.Tensor, colidx: torch.Tensor, values: torch.Tensor
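
Note: per their docstrings, the removed get_colrow_absmax and get_row_absmax computed the row-wise (and optionally column-wise) absolute maxima used as LLM.int8() quantization statistics, zeroing out values at or above threshold so they can be routed through the sparse outlier path. The sketch below is a rough pure-PyTorch rendering of that computation for illustration only; it is not a drop-in replacement for the removed CUDA-backed functions, and the name colrow_absmax_reference is invented for this example.

    import torch

    def colrow_absmax_reference(A: torch.Tensor, threshold: float = 0.0):
        absA = A.abs().float().view(-1, A.shape[-1])
        outlier_mask = None
        if threshold > 0.0:
            # Values at or above the threshold are outliers; exclude them from the stats.
            outlier_mask = absA >= threshold
            absA = absA.masked_fill(outlier_mask, 0.0)
        row_stats = absA.amax(dim=1)  # one absmax per row
        col_stats = absA.amax(dim=0)  # one absmax per column
        return row_stats, col_stats, outlier_mask

    A = torch.randn(4, 8, dtype=torch.float16)
    row_stats, col_stats, outlier_mask = colrow_absmax_reference(A, threshold=6.0)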

csrc/kernels.cu

Lines changed: 0 additions & 45 deletions
@@ -1825,51 +1825,6 @@ __launch_bounds__(1024, BNB_MAX_THREADS_PER_SM / 1024) __global__
     }
 }

-template <typename T, int THREADS, int SPARSE_DECOMP>
-__launch_bounds__(1024, BNB_MAX_THREADS_PER_SM / 1024) __global__
-    void kgetRowStats(T* __restrict__ A, float* rowStats, float threshold, int rows, int cols) {
-    using BlockReduceT = cub::BlockReduce<float, THREADS>;
-
-    // One block per row.
-    // Threads load column values in a striped arrangement.
-    // e.g. t0 reads row[0], row[0+nthreads], ..
-    // and t1 reads row[1], row[1+nthreads], ..
-    // Each thread will determine its local absmax.
-    // We then do a blockwise reduction to determine the row's absmax.
-
-    __shared__ typename BlockReduceT::TempStorage temp_storage;
-
-    const int row_id = blockIdx.x;
-    const T* __restrict__ row_data = A + (row_id * cols);
-
-    // Threads will read the row values in a striped access pattern and find a local absmax.
-    float row_local_absmax = -FLT_MIN;
-    for (int i = threadIdx.x; i < cols; i += THREADS) {
-        const float absval = fabsf(row_data[i]);
-
-        // For sparse decomposition, values outside of the threshold are not to be
-        // included when calculating the row's absmax.
-        if constexpr (SPARSE_DECOMP) {
-            row_local_absmax = fmaxf(row_local_absmax, absval < threshold ? absval : row_local_absmax);
-        } else {
-            row_local_absmax = fmaxf(row_local_absmax, absval);
-        }
-    }
-
-    // Reduce thread-local absmax across the block.
-    // TODO: Consider algorithm BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY
-    const float row_absmax = BlockReduceT(temp_storage).Reduce(row_local_absmax, CUB_REDUCTIONOP_MAX, cols);
-    if (threadIdx.x == 0) {
-        // Save our block's absmax to shared memory for the quantization step.
-        rowStats[row_id] = row_absmax;
-    }
-}
-
-template __global__ void
-kgetRowStats<half, 1024, 0>(half* __restrict__ A, float* rowStats, float threshold, int rows, int cols);
-template __global__ void
-kgetRowStats<half, 1024, 1>(half* __restrict__ A, float* rowStats, float threshold, int rows, int cols);
-
 template __global__ void kInt8VectorQuant<half, 1024, 0>(
     half* __restrict__ A, int8_t* out, float* rowStats, float threshold, int rows, int cols
 );
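
Note: the removed kgetRowStats kernel launched one block per row; each thread scanned columns in a striped pattern (t, t + THREADS, t + 2*THREADS, ...), kept a thread-local absmax, and a block-wide reduction produced the row's absmax. The snippet below only simulates that access pattern in plain PyTorch to make the reduction structure concrete; the 8-thread "block" is an arbitrary stand-in for the kernel's 1024 threads.

    import torch

    THREADS = 8  # stand-in for the kernel's 1024-thread block
    row = torch.randn(100)

    # Each simulated thread scans a strided slice of the row and keeps a local absmax.
    partials = torch.full((THREADS,), -float("inf"))
    for t in range(THREADS):
        strided = row[t::THREADS]  # striped access: columns t, t+THREADS, t+2*THREADS, ...
        if strided.numel() > 0:
            partials[t] = strided.abs().max()

    # The blockwise reduction: a single absmax per row.
    row_absmax = partials.max()
    assert torch.isclose(row_absmax, row.abs().max())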

csrc/kernels.cuh

Lines changed: 0 additions & 7 deletions
@@ -109,16 +109,9 @@ __global__ void kdequant_mm_int32_fp16(
     half* __restrict__ const bias, const int numRows, const int numCols, const int n
 );

-template <typename T, int THREADS, int SPARSE_DECOMP>
-__global__ void kgetRowStats(T* __restrict__ A, float* rowStats, float threshold, int rows, int cols);
 template <typename T, int THREADS, int SPARSE_DECOMP>
 __global__ void kInt8VectorQuant(T* __restrict__ A, int8_t* out, float* rowStats, float threshold, int rows, int cols);

-template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int TRANSPOSE, int FORMAT>
-__global__ void kTransformRowToFormat(
-    char* __restrict__ const A, char* out, int rows, int cols, int tiledCols, int outRows, int outCols
-);
-
 template <typename T, int BITS, int THREADS>
 __global__ void gemm_device(int M, int N, int K, T* __restrict__ const A, T* B, T* out, int lda, int ldb, int ldc);
 template <typename T, int THREADS>

csrc/kernels.hip

Lines changed: 0 additions & 43 deletions
@@ -1946,49 +1946,6 @@ __global__ void kInt8VectorQuant(T * __restrict__ A, int8_t* out, float* rowStat
     }
 }

-template<typename T, int THREADS, int SPARSE_DECOMP>
-__launch_bounds__(1024, BNB_MAX_THREADS_PER_SM / 1024)
-__global__ void kgetRowStats(T * __restrict__ A, float *rowStats, float threshold, int rows, int cols) {
-    using BlockReduceT = hipcub::BlockReduce<float, THREADS>;
-
-    // One block per row.
-    // Threads load column values in a striped arrangement.
-    // e.g. t0 reads row[0], row[0+nthreads], ..
-    // and t1 reads row[1], row[1+nthreads], ..
-    // Each thread will determine its local absmax.
-    // We then do a blockwise reduction to determine the row's absmax.
-
-    __shared__ typename BlockReduceT::TempStorage temp_storage;
-
-    const int row_id = blockIdx.x;
-    const T* __restrict__ row_data = A + (row_id * cols);
-
-    // Threads will read the row values in a striped access pattern and find a local absmax.
-    float row_local_absmax = -FLT_MIN;
-    for (int i = threadIdx.x; i < cols; i += THREADS) {
-        const float absval = fabsf(row_data[i]);
-
-        // For sparse decomposition, values outside of the threshold are not to be
-        // included when calculating the row's absmax.
-        if constexpr (SPARSE_DECOMP) {
-            row_local_absmax = fmaxf(row_local_absmax, absval < threshold ? absval : row_local_absmax);
-        } else {
-            row_local_absmax = fmaxf(row_local_absmax, absval);
-        }
-    }
-
-    // Reduce thread-local absmax across the block.
-    // TODO: Consider algorithm BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY
-    const float row_absmax = BlockReduceT(temp_storage).Reduce(row_local_absmax, hipcub::Max(), cols);
-    if (threadIdx.x == 0) {
-        // Save our block's absmax to shared memory for the quantization step.
-        rowStats[row_id] = row_absmax;
-    }
-}
-
-template __global__ void kgetRowStats<half, 1024, 0>(half * __restrict__ A, float *rowStats, float threshold, int rows, int cols);
-template __global__ void kgetRowStats<half, 1024, 1>(half * __restrict__ A, float *rowStats, float threshold, int rows, int cols);
-
 template __global__ void kInt8VectorQuant<half, 1024, 0>(half * __restrict__ A, int8_t *out, float *rowStats, float threshold, int rows, int cols);
 template __global__ void kInt8VectorQuant<half, 1024, 1>(half * __restrict__ A, int8_t *out, float *rowStats, float threshold, int rows, int cols);

csrc/kernels_hip.cuh

Lines changed: 0 additions & 7 deletions
@@ -111,16 +111,9 @@ __global__ void kdequant_mm_int32_fp16(
     half* __restrict__ const bias, const int numRows, const int numCols, const int n
 );

-template <typename T, int THREADS, int SPARSE_DECOMP>
-__global__ void kgetRowStats(T* __restrict__ A, float* rowStats, float threshold, int rows, int cols);
 template <typename T, int THREADS, int SPARSE_DECOMP>
 __global__ void kInt8VectorQuant(T* __restrict__ A, int8_t* out, float* rowStats, float threshold, int rows, int cols);

-template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int TRANSPOSE, int FORMAT>
-__global__ void kTransformRowToFormat(
-    char* __restrict__ const A, char* out, int rows, int cols, int tiledCols, int outRows, int outCols
-);
-
 template <typename T, int BITS, int THREADS>
 __global__ void gemm_device(int M, int N, int K, T* __restrict__ const A, T* B, T* out, int lda, int ldb, int ldc);
 template <typename T, int THREADS>
