@@ -95,6 +95,7 @@ def check_message(msg):
                 '--seq-length', str(md.seq_length),
                 '--num-attention-heads', str(md.num_attention_heads),
                 '--max-position-embeddings', str(md.max_position_embeddings),
+                '--attention-head-type', str(md.attention_head_type),
                 '--tokenizer-type', str(md.tokenizer_type),
                 '--tensor-model-parallel-size', str(args.target_tensor_parallel_size),
                 '--pipeline-model-parallel-size', str(args.target_pipeline_parallel_size),
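The new '--attention-head-type' value is simply forwarded to the target model's argument parser along with the rest of the checkpoint metadata. As a rough, non-authoritative sketch of how such a flag could be declared on the receiving side (the actual definition is not part of this diff; only the two choice values are taken from the branches below):

# Hypothetical argparse registration for '--attention-head-type'.
# The choices "multihead" and "multiquery" are the values this diff
# branches on; the default and help text are assumptions.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--attention-head-type', type=str, default='multihead',
                    choices=['multihead', 'multiquery'],
                    help='Standard multi-head attention or multi-query '
                         'attention with a single shared key/value head.')

args = parser.parse_args(['--attention-head-type', 'multiquery'])
assert args.attention_head_type == 'multiquery'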
@@ -225,10 +226,17 @@ def get_models(count, dtype, pre_process, post_process):
             post_layernorm_weight = msg.pop("post layernorm weight")
             post_layernorm_bias = msg.pop("post layernorm bias")
             mlp_l1_bias = msg.pop("mlp l1 bias")
+            if margs.attention_head_type == "multiquery":
+                kv_weight = msg.pop("kv weight")
+                kv_bias = msg.pop("kv bias")

             # Split up the parallel tensors
-            qkv_weight = torch.chunk(msg.pop("qkv weight"), args.target_tensor_parallel_size, dim=0)
-            qkv_bias = torch.chunk(msg.pop("qkv bias"), args.target_tensor_parallel_size, dim=0)
+            if margs.attention_head_type == "multihead":
+                qkv_weight = torch.chunk(msg.pop("qkv weight"), args.target_tensor_parallel_size, dim=0)
+                qkv_bias = torch.chunk(msg.pop("qkv bias"), args.target_tensor_parallel_size, dim=0)
+            elif margs.attention_head_type == "multiquery":
+                q_weight = torch.chunk(msg.pop("q weight"), args.target_tensor_parallel_size, dim=0)
+                q_bias = torch.chunk(msg.pop("q bias"), args.target_tensor_parallel_size, dim=0)
             dense_weight = torch.chunk(msg.pop("dense weight"), args.target_tensor_parallel_size, dim=1)
             mlp_l0_weight = torch.chunk(msg.pop("mlp l0 weight"), args.target_tensor_parallel_size, dim=0)
             mlp_l0_bias = torch.chunk(msg.pop("mlp l0 bias"), args.target_tensor_parallel_size, dim=0)
@@ -239,8 +247,15 @@ def get_models(count, dtype, pre_process, post_process):
                 l = models[tp_rank].language_model.encoder.layers[layer]
                 l.input_layernorm.weight.data.copy_(input_layernorm_weight)
                 l.input_layernorm.bias.data.copy_(input_layernorm_bias)
-                l.self_attention.query_key_value.weight.data.copy_(qkv_weight[tp_rank])
-                l.self_attention.query_key_value.bias.data.copy_(qkv_bias[tp_rank])
+                if margs.attention_head_type == "multihead":
+                    l.self_attention.query_key_value.weight.data.copy_(qkv_weight[tp_rank])
+                    l.self_attention.query_key_value.bias.data.copy_(qkv_bias[tp_rank])
+                elif margs.attention_head_type == "multiquery":
+                    # MQA: key-value are shared across tp-ranks
+                    l.self_attention.key_value.weight.data.copy_(kv_weight)
+                    l.self_attention.key_value.bias.data.copy_(kv_bias)
+                    l.self_attention.query.weight.data.copy_(q_weight[tp_rank])
+                    l.self_attention.query.bias.data.copy_(q_bias[tp_rank])
                 l.self_attention.dense.weight.data.copy_(dense_weight[tp_rank])
                 l.self_attention.dense.bias.data.copy_(dense_bias)
                 l.post_attention_layernorm.weight.data.copy_(post_layernorm_weight)
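The branch above captures the core of the multi-query change: query projections are chunked across the target tensor-parallel ranks, while the single key/value projection is copied unchanged to every rank. A minimal standalone sketch of that sharding scheme, using purely illustrative sizes (none of these shapes come from the script):

# Hedged sketch of MQA weight distribution under tensor parallelism:
# queries are split per rank, the shared key/value projection is replicated.
import torch

hidden_size = 16                      # illustrative only
num_heads = 4
head_dim = hidden_size // num_heads
tp_size = 2

# Multi-query attention: per-head query projections, one shared K/V projection.
q_weight = torch.randn(num_heads * head_dim, hidden_size)   # split across ranks
kv_weight = torch.randn(2 * head_dim, hidden_size)          # replicated on every rank

# Queries: one shard per tensor-parallel rank, split along the output dimension.
q_shards = torch.chunk(q_weight, tp_size, dim=0)

for tp_rank in range(tp_size):
    rank_q = q_shards[tp_rank]        # this rank's slice of the query heads
    rank_kv = kv_weight.clone()       # the full key/value weights, same on every rank
    assert rank_q.shape[0] == num_heads * head_dim // tp_size
    assert torch.equal(rank_kv, kv_weight)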