Skip to content
This repository was archived by the owner on Sep 9, 2025. It is now read-only.

Commit 5fca4cc

Browse files
authored
Update for Recent Changes and Granite Model Class Support (#11)
* Update for granite model class support Signed-off-by: Mustafa Eyceoz <meyceoz@redhat.com> * Add mixins Signed-off-by: Mustafa Eyceoz <meyceoz@redhat.com> * Removing rmsnorm options to avoid optional checks Signed-off-by: Mustafa Eyceoz <meyceoz@redhat.com> * Remove TP import Signed-off-by: Mustafa Eyceoz <meyceoz@redhat.com> * Add config init Signed-off-by: Mustafa Eyceoz <meyceoz@redhat.com> * Remove granite moe Signed-off-by: Mustafa Eyceoz <meyceoz@redhat.com> * Remove mixtral Signed-off-by: Mustafa Eyceoz <meyceoz@redhat.com> * Remove excess register stuff Signed-off-by: Mustafa Eyceoz <meyceoz@redhat.com> --------- Signed-off-by: Mustafa Eyceoz <meyceoz@redhat.com>
1 parent da678a5 commit 5fca4cc

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

59 files changed

+3055
-1836
lines changed

src/instructlab/dolomite/enums.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,83 @@ class ParamsGroupMethod(Enum):
1111

1212
class GradientCheckpointingMethod(Enum):
    """Granularity at which gradient checkpointing is applied."""

    # checkpoint activations one transformer block at a time
    block = "block"
14+
15+
16+
class LRDecaySchedule(str, Enum):
    """Learning-rate decay schedules supported by the trainer.

    Inherits from ``str`` so members compare equal to their plain
    string values (useful for config parsing / serialization).
    """

    constant = "constant"
    cosine = "cosine"
    exponential = "exponential"
    linear = "linear"
    power = "power"
22+
23+
24+
class AttentionImplementation(Enum):
    """
    Enum class for attention implementation
    """

    eager = "eager"
    sdpa = "sdpa"
    flash_attention_2 = "flash_attention_2"
32+
33+
34+
class MoEImplementation(Enum):
    """
    Enum class for MoE implementation
    """

    eager = "eager"
    scattermoe = "scattermoe"
41+
42+
43+
class DatasetSplit(str, Enum):
    """Dataset split (train / validation / test)."""

    train = "train"
    val = "val"
    test = "test"
49+
50+
51+
class Mode(str, Enum):
    """Top-level run mode of the engine (training / inference / ...)."""

    training = "training"
    inference = "inference"
    unsharding = "unsharding"
    distillation = "distillation"
58+
59+
60+
class TuningMethod(str, Enum):
    """Tuning / training method used for the run."""

    pretraining = "pretraining"
    full_finetuning = "full_finetuning"
    prompt_tuning = "prompt_tuning"
    lora = "lora"
    distillation = "distillation"
68+
69+
70+
class FP8Backend(str, Enum):
    """Backend library used for FP8 training."""

    msamp = "msamp"
    nvte = "nvte"
73+
74+
75+
class LossMask(str, Enum):
    """Type of loss masking method"""

    output_only = "output_only"
    no_mask = "no_mask"
80+
81+
82+
class KLDivergenceMethod(str, Enum):
    """Type of KL divergence (direction of the divergence term)."""

    forward = "forward"
    backward = "backward"
87+
88+
89+
class ExperimentsTrackerName(str, Enum):
    """Experiment tracker to use"""

    aim = "aim"
    wandb = "wandb"

src/instructlab/dolomite/hf_models/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# Extracted from https://github.com/ibm-granite/dolomite-engine
33
# ----------------------------------------------------------------
44
# Local
5-
from .config import GPTDolomiteConfig
5+
from .models.gpt_dolomite.config import GPTDolomiteConfig
66
from .model_conversion import export_to_huggingface, import_from_huggingface
77
from .models import GPTDolomiteForCausalLM, GPTDolomiteModel
88
from .register_hf import register_model_classes

src/instructlab/dolomite/hf_models/config.py

Lines changed: 12 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,9 @@
1-
# ----------------------------------------------------------------
2-
# Extracted from https://github.com/ibm-granite/dolomite-engine
3-
# ----------------------------------------------------------------
4-
# Third Party
51
from transformers import PretrainedConfig
62

7-
# Local
8-
from .enums import AttentionHeadType, PositionEmbeddingType
3+
from .enums import AttentionHeadType, InitMethod, PositionEmbeddingType
94

105

11-
class GPTDolomiteConfig(PretrainedConfig):
12-
model_type = "gpt_dolomite"
6+
class CommonConfig(PretrainedConfig):
137
keys_to_ignore_at_inference = ["past_key_values"]
148
attribute_map = {
159
"hidden_size": "n_embd",
@@ -18,20 +12,15 @@ class GPTDolomiteConfig(PretrainedConfig):
1812
"num_hidden_layers": "n_layer",
1913
}
2014

21-
# NOTE: initializer range is kept for backward compatibility
22-
# but it is not used anymore
23-
# : also rope_scaling is not used anymore but kept for
24-
# same reason.
25-
2615
def __init__(
2716
self,
2817
vocab_size: int = 50257,
2918
n_positions: int = 1024,
3019
n_embd: int = 768,
3120
n_layer: int = 12,
3221
n_head: int = 12,
33-
num_key_value_heads: int = None,
34-
n_inner: int = None,
22+
num_key_value_heads: int | None = None,
23+
n_inner: int | None = None,
3524
activation_function: str = "gelu_pytorch_tanh",
3625
attention_head_type: str = "mqa",
3726
resid_pdrop: float = 0.1,
@@ -41,20 +30,19 @@ def __init__(
4130
layer_norm_epsilon: float = 1e-5,
4231
initializer_range: float = 0.02,
4332
scale_attn_weights: bool = True,
44-
attention_multiplier: float = None,
33+
attention_multiplier: float | None = None,
4534
use_cache: bool = True,
4635
bos_token_id: int = 50256,
4736
eos_token_id: int = 50256,
4837
pad_token_id: int = 50256,
4938
attention_softmax_in_fp32: bool = True,
50-
scale_attention_softmax_in_fp32: bool = True,
5139
add_bias: bool = True,
5240
position_embedding_type: str = "learned_absolute",
5341
rope_theta: int = 10000,
54-
rope_scaling: dict = None,
55-
m_emb: float = None,
56-
m_width: float = None,
57-
m_residual: float = None,
42+
rope_scaling: dict | None = None,
43+
m_emb: float | None = None,
44+
m_width: float | None = None,
45+
m_residual: float | None = None,
5846
init_method: str = "normal",
5947
upcast_logits_for_loss: bool = False,
6048
**kwargs,
@@ -78,7 +66,6 @@ def __init__(
7866
self.attention_multiplier = attention_multiplier
7967
self.use_cache = use_cache
8068
self.attention_softmax_in_fp32 = attention_softmax_in_fp32
81-
self.scale_attention_softmax_in_fp32 = scale_attention_softmax_in_fp32
8269
self.position_embedding_type = position_embedding_type
8370
self.add_bias = add_bias
8471
self.rope_theta = rope_theta
@@ -93,6 +80,7 @@ def __init__(
9380
assert self.scale_attn_weights
9481

9582
# check if enums are valid
83+
init_method = InitMethod(init_method)
9684
attention_head_type = AttentionHeadType(attention_head_type)
9785
position_embedding_type = PositionEmbeddingType(position_embedding_type)
9886

@@ -110,9 +98,7 @@ def __init__(
11098
if self.num_key_value_heads is None:
11199
self.num_key_value_heads = 1
112100

113-
assert (
114-
self.num_key_value_heads == 1
115-
), "MultiQueryAttention should have 1 head for keys and values"
101+
assert self.num_key_value_heads == 1, "MultiQueryAttention should have 1 head for keys and values"
116102
elif attention_head_type == AttentionHeadType.gqa:
117103
assert (
118104
self.num_key_value_heads is not None
@@ -122,9 +108,4 @@ def __init__(
122108
self.n_head % self.num_key_value_heads == 0
123109
), "GroupedQueryAttention should have more than 1 head for keys and values"
124110

125-
super().__init__(
126-
bos_token_id=bos_token_id,
127-
eos_token_id=eos_token_id,
128-
pad_token_id=pad_token_id,
129-
**kwargs,
130-
)
111+
super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, pad_token_id=pad_token_id, **kwargs)
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
DEFAULT_NORMALIZATION_IMPLEMENTATION = "torch"

src/instructlab/dolomite/hf_models/enums.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
1-
# ----------------------------------------------------------------
2-
# Extracted from https://github.com/ibm-granite/dolomite-engine
3-
# ----------------------------------------------------------------
4-
# Standard
51
from enum import Enum
62

73

4+
class InitMethod(Enum):
    """Weight initialization scheme for the model."""

    normal = "normal"
    # mup = maximal update parametrization — presumably pairs with the
    # m_emb / m_width / m_residual config multipliers; confirm against usage
    mup = "mup"
7+
8+
89
class PositionEmbeddingType(Enum):
910
"""
1011
Enum class for position embeddings
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
from .dense import BaseModelMixin, CausalLMModelMixin, PreTrainedModelMixin
2+
#from .dense_TP import BaseModelMixin_TP, CausalLMModelMixin_TP, PreTrainedModelMixin_TP
3+
from .moe import BaseMoEModelMixin, CausalLMMoEModelMixin, PreTrainedMoEModelMixin
4+
#from .moe_TP import BaseMoEModelMixin_TP, CausalLMMoEModelMixin_TP, PreTrainedMoEModelMixin_TP
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
from .base import BaseModelMixin, PreTrainedModelMixin
2+
from .main import CausalLMModelMixin

0 commit comments

Comments
 (0)