
Commit ac5d6ee

[docs] implement API docs (#1075)
* optims
* fix path
* fix path
* mdx
* fix path
* toctree
* fix
* optimizer, adagrad
* add init
* add
* more apis
* params
* clarify
* run pre-commit hooks

---------

Co-authored-by: Titus von Koeller <[email protected]>
1 parent 87e029b commit ac5d6ee

25 files changed: +1389 -44 lines changed

.git-blame-ignore-revs

Lines changed: 3 additions & 0 deletions
@@ -6,3 +6,6 @@ ea7c14f8ef64924f2d0ff80df3cdabf2c7299848
 
 # Remove f-prefix from strings that don't use formatting
 7727fa4c8c6c1ef2b109120aff4196a0a6bf3ed6
+
+# format tests/linear_4bit.py
+34735ba89de8235ea9da6ef409f814dcea9e2038

bitsandbytes/nn/modules.py

Lines changed: 61 additions & 22 deletions
@@ -21,16 +21,7 @@
 
 class StableEmbedding(torch.nn.Embedding):
     """
-    Custom embedding layer designed for stable training in NLP tasks. The stable
-    embedding layer improves stability during optimization for models with word
-    embeddings, addressing issues related to the non-uniform distribution of input
-    tokens.
-
-    This stable embedding layer is initialized with Xavier uniform initialization,
-    followed by layer normalization. It is designed to support aggressive quantization,
-    addressing extreme gradient variations in non-uniform input distributions. The
-    stability of training is enhanced by using 32-bit optimizer states specifically
-    for this layer.
+    Custom embedding layer designed to improve stability during training for NLP tasks by using 32-bit optimizer states. It is designed to reduce gradient variations that can result from quantization. This embedding layer is initialized with Xavier uniform initialization followed by layer normalization.
 
     Example:
 
@@ -47,14 +38,11 @@ class StableEmbedding(torch.nn.Embedding):
     ```
 
     Attributes:
-        norm (torch.nn.LayerNorm): Layer normalization applied after the embedding.
+        norm (`torch.nn.LayerNorm`): Layer normalization applied after the embedding.
 
     Methods:
         reset_parameters(): Reset embedding parameters using Xavier uniform initialization.
         forward(input: Tensor) -> Tensor: Forward pass through the stable embedding layer.
-
-    Reference:
-        - [8-bit optimizer paper](https://arxiv.org/pdf/2110.02861.pdf)
     """
     def __init__(
         self,
@@ -71,14 +59,22 @@ def __init__(
     ) -> None:
         """
         Args:
-            num_embeddings (`int`): The number of unique embeddings (vocabulary size).
-            embedding_dim (`int`): The dimensionality of the embedding.
-            padding_idx (`Optional[int]`): If specified, pads the output with zeros at the given index.
-            max_norm (`Optional[float]`): If given, renormalizes embeddings to have a maximum L2 norm.
-            norm_type (`float`, defaults to `2.0`): The p-norm to compute for the max_norm option.
-            scale_grad_by_freq (`bool`): Scale gradient by frequency during backpropagation.
-            sparse (`bool`): If True, computes sparse gradients; False, computes dense gradients.
-            _weight (`Optional[Tensor]`): Pre-trained embeddings.
+            num_embeddings (`int`):
+                The number of unique embeddings (vocabulary size).
+            embedding_dim (`int`):
+                The dimensionality of the embedding.
+            padding_idx (`Optional[int]`):
+                Pads the output with zeros at the given index.
+            max_norm (`Optional[float]`):
+                Renormalizes embeddings to have a maximum L2 norm.
+            norm_type (`float`, defaults to `2.0`):
+                The p-norm to compute for the `max_norm` option.
+            scale_grad_by_freq (`bool`, defaults to `False`):
+                Scale gradient by frequency during backpropagation.
+            sparse (`bool`, defaults to `False`):
+                Computes dense gradients. Set to `True` to compute sparse gradients instead.
+            _weight (`Optional[Tensor]`):
+                Pretrained embeddings.
         """
         super().__init__(
             num_embeddings,
@@ -131,6 +127,9 @@ def forward(self, input: Tensor) -> Tensor:
 
 
 class Embedding(torch.nn.Embedding):
+    """
+    Embedding class to store and retrieve word embeddings from their indices.
+    """
     def __init__(
         self,
         num_embeddings: int,
@@ -143,6 +142,25 @@ def __init__(
         _weight: Optional[Tensor] = None,
         device: Optional[device] = None,
     ) -> None:
+        """
+        Args:
+            num_embeddings (`int`):
+                The number of unique embeddings (vocabulary size).
+            embedding_dim (`int`):
+                The dimensionality of the embedding.
+            padding_idx (`Optional[int]`):
+                Pads the output with zeros at the given index.
+            max_norm (`Optional[float]`):
+                Renormalizes embeddings to have a maximum L2 norm.
+            norm_type (`float`, defaults to `2.0`):
+                The p-norm to compute for the `max_norm` option.
+            scale_grad_by_freq (`bool`, defaults to `False`):
+                Scale gradient by frequency during backpropagation.
+            sparse (`bool`, defaults to `False`):
+                Computes dense gradients. Set to `True` to compute sparse gradients instead.
+            _weight (`Optional[Tensor]`):
+                Pretrained embeddings.
+        """
         super().__init__(
             num_embeddings,
             embedding_dim,
@@ -416,7 +434,19 @@ def forward(self, x: torch.Tensor):
 
 
 class LinearFP4(Linear4bit):
+    """
+    Implements the FP4 data type.
+    """
     def __init__(self, input_features, output_features, bias=True, compute_dtype=None, compress_statistics=True, quant_storage=torch.uint8, device=None):
+        """
+        Args:
+            input_features (`int`):
+                Number of input features of the linear layer.
+            output_features (`int`):
+                Number of output features of the linear layer.
+            bias (`bool`, defaults to `True`):
+                Whether the linear class uses the bias term as well.
+        """
         super().__init__(input_features, output_features, bias, compute_dtype, compress_statistics, 'fp4', quant_storage, device)
 
 
@@ -432,6 +462,15 @@ class LinearNF4(Linear4bit):
     the `functional.py` file: https://github.com/TimDettmers/bitsandbytes/blob/main/bitsandbytes/functional.py#L236.
     '''
     def __init__(self, input_features, output_features, bias=True, compute_dtype=None, compress_statistics=True, quant_storage=torch.uint8, device=None):
+        """
+        Args:
+            input_features (`int`):
+                Number of input features of the linear layer.
+            output_features (`int`):
+                Number of output features of the linear layer.
+            bias (`bool`, defaults to `True`):
+                Whether the linear class uses the bias term as well.
+        """
         super().__init__(input_features, output_features, bias, compute_dtype, compress_statistics, 'nf4', quant_storage, device)
 
 
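For context (not part of the commit), a minimal sketch of the 4-bit linear layers documented above. It assumes a CUDA device and a CUDA-enabled bitsandbytes build, since the weights are quantized when they are moved to the GPU; the layer sizes are illustrative:

```python
import torch
import bitsandbytes as bnb

# NF4 variant; LinearFP4 is constructed the same way and uses the FP4 data type.
qlinear = bnb.nn.LinearNF4(768, 3072, bias=True, compute_dtype=torch.float16)

# Moving the layer to the GPU quantizes its weights to 4-bit storage.
qlinear = qlinear.to("cuda")

x = torch.randn(2, 768, dtype=torch.float16, device="cuda")
y = qlinear(x)  # matmul runs in compute_dtype against dequantized weights
print(y.shape)  # torch.Size([2, 3072])
```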
bitsandbytes/optim/adagrad.py

Lines changed: 81 additions & 0 deletions
@@ -20,6 +20,33 @@ def __init__(
         percentile_clipping=100,
         block_wise=True,
     ):
+        """
+        Base Adagrad optimizer.
+
+        Arguments:
+            params (`torch.tensor`):
+                The input parameters to optimize.
+            lr (`float`, defaults to 1e-2):
+                The learning rate.
+            lr_decay (`int`, defaults to 0):
+                The learning rate decay.
+            weight_decay (`float`, defaults to 0.0):
+                The weight decay value for the optimizer.
+            initial_accumulator_value (`int`, defaults to 0):
+                The initial momentum values.
+            eps (`float`, defaults to 1e-10):
+                The epsilon value prevents division by zero in the optimizer.
+            optim_bits (`int`, defaults to 32):
+                The number of bits of the optimizer state.
+            args (`dict`, defaults to `None`):
+                A dictionary with additional arguments.
+            min_8bit_size (`int`, defaults to 4096):
+                The minimum number of elements of the parameter tensors for 8-bit optimization.
+            percentile_clipping (`int`, defaults to 100):
+                Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
+            block_wise (`bool`, defaults to `True`):
+                Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
+        """
         if not 0.0 <= lr:
             raise ValueError(f"Invalid learning rate: {lr}")
         if not 0.0 <= weight_decay:
@@ -62,6 +89,33 @@ def __init__(
         percentile_clipping=100,
         block_wise=True,
     ):
+        """
+        8-bit Adagrad optimizer.
+
+        Arguments:
+            params (`torch.tensor`):
+                The input parameters to optimize.
+            lr (`float`, defaults to 1e-2):
+                The learning rate.
+            lr_decay (`int`, defaults to 0):
+                The learning rate decay.
+            weight_decay (`float`, defaults to 0.0):
+                The weight decay value for the optimizer.
+            initial_accumulator_value (`int`, defaults to 0):
+                The initial momentum values.
+            eps (`float`, defaults to 1e-10):
+                The epsilon value prevents division by zero in the optimizer.
+            optim_bits (`int`, defaults to 8):
+                The number of bits of the optimizer state.
+            args (`dict`, defaults to `None`):
+                A dictionary with additional arguments.
+            min_8bit_size (`int`, defaults to 4096):
+                The minimum number of elements of the parameter tensors for 8-bit optimization.
+            percentile_clipping (`int`, defaults to 100):
+                Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
+            block_wise (`bool`, defaults to `True`):
+                Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
+        """
         if not 0.0 <= lr:
             raise ValueError(f"Invalid learning rate: {lr}")
         if not 0.0 <= weight_decay:
@@ -105,6 +159,33 @@ def __init__(
         percentile_clipping=100,
         block_wise=True,
     ):
+        """
+        32-bit Adagrad optimizer.
+
+        Arguments:
+            params (`torch.tensor`):
+                The input parameters to optimize.
+            lr (`float`, defaults to 1e-2):
+                The learning rate.
+            lr_decay (`int`, defaults to 0):
+                The learning rate decay.
+            weight_decay (`float`, defaults to 0.0):
+                The weight decay value for the optimizer.
+            initial_accumulator_value (`int`, defaults to 0):
+                The initial momentum values.
+            eps (`float`, defaults to 1e-10):
+                The epsilon value prevents division by zero in the optimizer.
+            optim_bits (`int`, defaults to 32):
+                The number of bits of the optimizer state.
+            args (`dict`, defaults to `None`):
+                A dictionary with additional arguments.
+            min_8bit_size (`int`, defaults to 4096):
+                The minimum number of elements of the parameter tensors for 8-bit optimization.
+            percentile_clipping (`int`, defaults to 100):
+                Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
+            block_wise (`bool`, defaults to `True`):
+                Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
+        """
         if not 0.0 <= lr:
             raise ValueError(f"Invalid learning rate: {lr}")
         if not 0.0 <= weight_decay:
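For context (not part of the commit), a short training-step sketch with the 8-bit Adagrad variant documented above. The toy model and data are illustrative, and a CUDA device is assumed since the 8-bit optimizer states live on the GPU:

```python
import torch
import bitsandbytes as bnb

model = torch.nn.Linear(4096, 10).cuda()
optimizer = bnb.optim.Adagrad8bit(model.parameters(), lr=1e-2, weight_decay=0.0)

x = torch.randn(32, 4096, device="cuda")
target = torch.randint(0, 10, (32,), device="cuda")

loss = torch.nn.functional.cross_entropy(model(x), target)
loss.backward()
optimizer.step()       # 8-bit state is only used for tensors with at least min_8bit_size (4096) elements
optimizer.zero_grad()
```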
