Clean up some deprecated functions.

matthewdouglas · matthewdouglas · commit 2813571d9feb · 2025-02-12T15:18:21.000-05:00
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
@@ -431,11 +431,6 @@ def create_quantile_map(A, total_bits=8):
     return q
 
 
-@deprecated("This function is deprecated and will be removed in a future version.", category=FutureWarning)
-def get_special_format_str():
-    return "row"
-
-
 def is_on_gpu(tensors: Iterable[Optional[torch.Tensor]]):
     """Verifies that the input tensors are all on the same device.
 
@@ -472,11 +467,6 @@ def is_on_gpu(tensors: Iterable[Optional[torch.Tensor]]):
     return on_gpu
 
 
-@deprecated("This function is deprecated and will be removed in a future release.", category=FutureWarning)
-def get_tensor_stream(tensor: Tensor) -> torch.cuda.Stream:
-    return torch.cuda.current_stream(tensor.device)
-
-
 def _get_tensor_stream(tensor: Tensor) -> ct.c_void_p:
     # We use the raw stream for performance reasons.
     return ct.c_void_p(torch._C._cuda_getCurrentRawStream(tensor.device.index))
@@ -2251,27 +2241,6 @@ def batched_igemm(
     return out
 
 
-@deprecated(
-    "igemmlt is deprecated and will be removed in a future release. Please use int8_linear_matmul instead.",
-    category=FutureWarning,
-)
-def igemmlt(
-    A: torch.Tensor,
-    B: torch.Tensor,
-    SA: Tuple[torch.Size, str],
-    SB: Tuple[torch.Size, str],
-    out: Optional[torch.Tensor] = None,
-    Sout: Optional[Tuple[torch.Size, str]] = None,
-    dtype=torch.int32,
-):
-    if SA is not None and SA[1] != "row":
-        raise NotImplementedError(f"Only row-major format inputs are supported, but got format `{SA[1]}`")
-    if SB is not None and SB[1] != "row":
-        raise NotImplementedError(f"Only row-major format is supported for matrix B, but got format `{SB[1]}`")
-    result = int8_linear_matmul(A, B, out=out, dtype=dtype)
-    return result, (result.shape, "row")
-
-
 def int8_linear_matmul(A: torch.Tensor, B: torch.Tensor, out: Optional[torch.Tensor] = None, dtype=torch.int32):
     """Performs an 8-bit integer matrix multiplication.
 
@@ -2316,20 +2285,6 @@ def int8_mm_dequant(
     return torch.ops.bitsandbytes.int8_mm_dequant(A, row_stats, col_stats, out, bias)
 
 
-@deprecated("mm_dequant is deprecated. Please use int8_mm_dequant() instead.", category=FutureWarning)
-def mm_dequant(
-    A: torch.Tensor,
-    quant_state: Optional[Tuple[torch.Size, str]],  # Not used
-    row_stats: torch.Tensor,
-    col_stats: torch.Tensor,
-    out: Optional[torch.Tensor] = None,
-    new_row_stats=None,  # Not used
-    new_col_stats=None,  # Not used
-    bias: Optional[torch.Tensor] = None,
-):
-    return int8_mm_dequant(A, row_stats, col_stats, out, bias)
-
-
 def get_colrow_absmax(
     A: torch.Tensor,
     row_stats: Optional[torch.Tensor] = None,
@@ -2505,72 +2460,6 @@ def coo_zeros(rows, cols, nnz, device, dtype=torch.half):
     return COOSparseTensor(rows, cols, nnz, rowidx, colidx, values)
 
 
-@deprecated("This function is deprecated. Please use `int8_double_quant` instead.", category=FutureWarning)
-def double_quant(
-    A: torch.Tensor,
-    col_stats: Optional[torch.Tensor] = None,
-    row_stats: Optional[torch.Tensor] = None,
-    out_col: Optional[torch.Tensor] = None,
-    out_row: Optional[torch.Tensor] = None,
-    threshold=0.0,
-) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[COOSparseTensor]]:
-    """Determine the quantization statistics for input matrix `A` in accordance to the `LLM.int8()` algorithm.
-
-    The statistics are determined both row-wise and column-wise (transposed).
-
-    For more information, see the [LLM.int8() paper](https://arxiv.org/abs/2208.07339).
-
-    <Tip warning={true}>
-    This function exists for backwards compatibility only. It is advised to use [`int8_double_quant`] instead.
-    The difference is that this function will return a [`COOSparseTensor`] for outliers instead of a column index.
-    </Tip>
-
-    Args:
-        A (`torch.Tensor` with dtype `torch.float16`): The input matrix.
-        col_stats (`torch.Tensor`, *optional*): A pre-allocated tensor to hold the column-wise quantization scales.
-        row_stats (`torch.Tensor`, *optional*): A pre-allocated tensor to hold the row-wise quantization scales.
-        out_col (`torch.Tensor`, *optional*): A pre-allocated tensor to hold the column-wise quantized data.
-        out_row (`torch.Tensor`, *optional*): A pre-allocated tensor to hold the row-wise quantized data.
-        threshold (`float`, *optional*):
-            An optional threshold for sparse decomposition of outlier features.
-
-            No outliers are held back when 0.0. Defaults to 0.0.
-
-    Returns:
-        `Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]`: A tuple containing the quantized tensor and relevant statistics.
-        - `torch.Tensor` with dtype `torch.int8`: The row-wise quantized data.
-        - `torch.Tensor` with dtype `torch.int8`: The column-wise quantized data.
-        - `torch.Tensor` with dtype `torch.float32`: The row-wise quantization scales.
-        - `torch.Tensor` with dtype `torch.float32`: The column-wise quantization scales.
-        - `COOSparseTensor`, *optional*: A structure representing the outlier values from the input tensor.
-    """
-
-    coo_tensor = None
-    quant_row, quant_col, row_stats, col_stats, outlier_cols = int8_double_quant(
-        A,
-        col_stats,
-        row_stats,
-        out_col,
-        out_row,
-        threshold=threshold,
-    )
-
-    if threshold > 0.0 and outlier_cols is not None:
-        # Build a COO tensor including all of the outlier columns.
-        outlier_rows = torch.arange(0, A.shape[0], device=A.device, dtype=torch.int32)
-        outliers = A[:, outlier_cols]
-        coo_tensor = COOSparseTensor(
-            A.shape[0],
-            A.shape[1],
-            outliers.numel(),
-            outlier_rows.repeat_interleave(outliers.size(1)),
-            outlier_cols.repeat(outliers.size(0)).int(),
-            outliers,
-        )
-
-    return quant_row, quant_col, row_stats, col_stats.flatten().float(), coo_tensor
-
-
 def int8_double_quant(
     A: torch.Tensor,
     col_stats: Optional[torch.Tensor] = None,
@@ -2992,21 +2881,6 @@ def vectorwise_mm_dequant(xq, S1, S2, dtype=torch.half, quant_type="vector"):
         return None
 
 
-@deprecated("This function is deprecated and will be removed in a future release.", category=FutureWarning)
-def dequant_min_max(xq, A, B, SA, SB, dtype=torch.half):
-    offset = B.float().t().sum(0) * (SA[0] + SA[1])
-    x = xq.float()
-    if len(xq.shape) == 2 and len(SB.shape) == 3:
-        SB = SB.squeeze(0)
-    if len(SB.shape) == 2:
-        x *= SB.t() / 127
-    else:
-        x *= SB / 127
-    x *= SA[1] / 127
-    x += offset
-    return x.to(dtype)
-
-
 @deprecated("This function is deprecated and will be removed in a future release.", category=FutureWarning)
 def extract_outliers(A, SA, idx):
     shapeA = SA[0]
@@ -3031,10 +2905,3 @@ def extract_outliers(A, SA, idx):
     post_call(prev_device)
 
     return out
-
-
-@deprecated("This function is deprecated and will be removed in a future release.", category=FutureWarning)
-def pipeline_test(A, batch_size):
-    out = torch.zeros_like(A)
-    lib.cpipeline_test(get_ptr(A), get_ptr(out), ct.c_size_t(A.numel()), ct.c_size_t(batch_size))
-    return out