feat(granite3): Add config plumbing for granite3-2b

gabe-l-hart · gabe-l-hart · commit 06566d98d7e7 · 2024-12-19T16:33:18.000-07:00
This only adds the config plumbing: the new multipliers are parsed and
stored but not yet applied in the architecture, so the model output is
garbage at the moment.

NOTE: There is currently a bug where this model is missing tokenizer.json
in HF, but that should be resolved soon.

Branch: GraniteThreeDenseSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
diff --git a/torchchat/model.py b/torchchat/model.py
@@ -287,6 +287,11 @@ class TransformerArgs:
     feed_forward_bias: bool = False
     # Whether or not to tie the input word embeddings to the output
     tie_word_embeddings: bool = False
+    # Granite architecture multipliers
+    embedding_multiplier: Optional[float] = None
+    attention_multiplier: Optional[float] = None
+    residual_multiplier: Optional[float] = None
+    logits_scaling: Optional[float] = None
 
     def __post_init__(self):
         if self.n_local_heads == -1:
diff --git a/torchchat/model_config/models.json b/torchchat/model_config/models.json
@@ -178,5 +178,12 @@
         "distribution_path": "ibm-granite/granite-8b-code-instruct-128k",
         "transformer_params_key": "Granite-8B-Code",
         "tokenizer_file": "tokenizer.json"
+    },
+    "ibm-granite/granite-3.0-2b-instruct": {
+        "aliases": ["granite3-2b"],
+        "distribution_channel": "HuggingFaceSnapshot",
+        "distribution_path": "ibm-granite/granite-3.0-2b-instruct",
+        "transformer_params_key": "Granite-3.0-2B-Instruct",
+        "tokenizer_file": "tokenizer.json"
     }
 }
diff --git a/torchchat/model_params/Granite-3.0-2B-Instruct.json b/torchchat/model_params/Granite-3.0-2B-Instruct.json
@@ -0,0 +1,21 @@
+{
+    "block_size": 8192,
+    "dim": 2048,
+    "hidden_dim": 8192,
+    "n_heads": 32,
+    "n_local_heads": 8,
+    "n_layers": 40,
+    "rope_base": 10000,
+    "vocab_size": 49155,
+    "use_hf_tokenizer": true,
+    "tokenizer_prepend_bos": false,
+    "norm_eps": 0.00001,
+    "rope_scaling": null,
+    "attention_bias": false,
+    "feed_forward_bias": false,
+    "tie_word_embeddings": true,
+    "embedding_multiplier": 12.0,
+    "attention_multiplier": 0.015625,
+    "residual_multiplier": 0.22,
+    "logits_scaling": 8.0
+}

Original file line number	Diff line number	Diff line change
`@@ -178,5 +178,12 @@`
`178`	`178`	`"distribution_path": "ibm-granite/granite-8b-code-instruct-128k",`
`179`	`179`	`"transformer_params_key": "Granite-8B-Code",`
`180`	`180`	`"tokenizer_file": "tokenizer.json"`
	`181`	`+ },`
	`182`	`+ "ibm-granite/granite-3.0-2b-instruct": {`
	`183`	`+ "aliases": ["granite3-2b"],`
	`184`	`+ "distribution_channel": "HuggingFaceSnapshot",`
	`185`	`+ "distribution_path": "ibm-granite/granite-3.0-2b-instruct",`
	`186`	`+ "transformer_params_key": "Granite-3.0-2B-Instruct",`
	`187`	`+ "tokenizer_file": "tokenizer.json"`
`181`	`188`	`}`
`182`	`189`	`}`