doc: add comments for clarifying query / KV groups (#2093)

raishish · web-flow · commit d32a7384331c · 2025-07-09T12:03:28.000+02:00
diff --git a/litgpt/model.py b/litgpt/model.py
@@ -408,7 +408,7 @@ def forward(
         qkv = self.qkv(x)  # (B, T, 3xC*)
 
         # Define query, key and value sizes.
-        # If grouped/multi query is enabled, these sizes are not equal (see the diagram in `lit_gpt/config.py::Config`).
+        # If grouped/multi query is enabled, these sizes are not equal (see the diagram above).
         query_size = n_head * head_size
         key_size = value_size = n_query_groups * head_size
         # Split qkv into query, key and value matrices.
@@ -420,9 +420,12 @@ def forward(
 
         # To place the num_heads (nh) dimension right after the batch (B) dimension, the first step is to decouple the
         # embedding size (C) into num_heads (nh) and head_size (hs).
+
+        # Following the original GQA paper, the term "query groups" is used here.
+        # Alternative notation: query groups are also referred to as KV groups.
         q = q.view(B, T, n_head, head_size)  # (B, T, nh_q, hs)
-        k = k.view(B, T, n_query_groups, head_size)  # (B, T, nh_k, hs)
-        v = v.view(B, T, n_query_groups, head_size)  # (B, T, nh_v, hs)
+        k = k.view(B, T, n_query_groups, head_size)  # (B, T, n_query_groups, hs)
+        v = v.view(B, T, n_query_groups, head_size)  # (B, T, n_query_groups, hs)
 
         # The tensors `query`, `key`, and `value` are now accurately structured: within each batch element (B), there are
         # multiple heads (nh), and within each head, there is a sequence of elements (T), each represented by a vector