
Commit 2c454e4

Merge remote-tracking branch 'upstream/main'
2 parents: 2d78aed + e87d462

3 files changed: +19 -8 lines


server/text_generation_server/models/custom_modeling/flash_llama_modeling.py

Lines changed: 8 additions & 4 deletions
@@ -156,10 +156,9 @@ def _load_gqa(config, prefix: str, weights):
     assert config.hidden_size % config.num_attention_heads == 0
     assert config.num_attention_heads % weights.process_group.size() == 0

+    prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"]
     weight = weights.get_multi_weights_col(
-        prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
-        quantize=config.quantize,
-        dim=0
+        prefixes=prefixes, quantize=config.quantize, dim=0
     )

     if config.quantize != "gptq":
@@ -173,7 +172,12 @@ def _load_gqa(config, prefix: str, weights):
         config.hidden_size,
     ], f"{list(weight.shape)} != {[(num_heads + 2 * config.num_key_value_heads) * head_size, config.hidden_size]}"

-    return TensorParallelColumnLinear(get_linear(weight, bias=config.attention_bias, quantize=config.quantize))
+    if config.attention_bias:
+        bias = torch.cat([weights.get_sharded(f"{p}.bias", dim=0) for p in prefixes], dim=0)
+    else:
+        bias = None
+
+    return TensorParallelColumnLinear(get_linear(weight, bias=bias, quantize=config.quantize))


 class FlashLlamaAttention(torch.nn.Module):
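
The interesting part of this hunk is the bias handling. `config.attention_bias` is a boolean flag in the model config, so the old code passed `True`/`False` where `get_linear` expects a bias tensor (or `None`); the new code loads each rank's shard of the `q_proj`/`k_proj`/`v_proj` biases and concatenates them so they stay aligned with the fused, column-sharded QKV weight returned by `get_multi_weights_col`. Below is a minimal sketch of that alignment, using made-up sizes and a plain slice in place of `weights.get_sharded`; it is an illustration, not code from the commit.

import torch

# Hypothetical sizes and rank, for illustration only.
hidden, world_size, rank = 8, 2, 1
shard = hidden // world_size

q_bias = torch.arange(0, hidden).float()        # stand-ins for the q_proj/k_proj/v_proj biases
k_bias = torch.arange(10, 10 + hidden).float()
v_bias = torch.arange(20, 20 + hidden).float()

def sharded(b):
    # analogue of weights.get_sharded(f"{p}.bias", dim=0): keep only this rank's rows
    return b[rank * shard:(rank + 1) * shard]

# Concatenating the per-rank shards keeps the bias rows in the same q|k|v order
# as the fused, column-sharded QKV weight.
bias = torch.cat([sharded(q_bias), sharded(k_bias), sharded(v_bias)], dim=0)
print(bias)  # tensor([ 4.,  5.,  6.,  7., 14., 15., 16., 17., 24., 25., 26., 27.])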

server/text_generation_server/models/custom_modeling/paged_llama_modeling.py

Lines changed: 8 additions & 4 deletions
@@ -156,10 +156,9 @@ def _load_gqa(config, prefix: str, weights):
     assert config.hidden_size % config.num_attention_heads == 0
     assert config.num_attention_heads % weights.process_group.size() == 0

+    prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"]
     weight = weights.get_multi_weights_col(
-        prefixes=[f"{prefix}.q_proj", f"{prefix}.k_proj", f"{prefix}.v_proj"],
-        quantize=config.quantize,
-        dim=0
+        prefixes=prefixes, quantize=config.quantize, dim=0
     )

     if config.quantize != "gptq":
@@ -173,7 +172,12 @@ def _load_gqa(config, prefix: str, weights):
         config.hidden_size,
     ], f"{list(weight.shape)} != {[(num_heads + 2 * config.num_key_value_heads) * head_size, config.hidden_size]}"

-    return TensorParallelColumnLinear(get_linear(weight, bias=config.attention_bias, quantize=config.quantize))
+    if config.attention_bias:
+        bias = torch.cat([weights.get_sharded(f"{p}.bias", dim=0) for p in prefixes], dim=0)
+    else:
+        bias = None
+
+    return TensorParallelColumnLinear(get_linear(weight, bias=bias, quantize=config.quantize))


 class PagedLlamaAttention(torch.nn.Module):
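
This is the same fix applied to the paged variant of `_load_gqa`. For reference, the shape assertion preserved in both hunks is the usual GQA bookkeeping: the fused QKV weight stacks the query heads plus one set of key heads and one set of value heads. A quick numeric check with illustrative Llama-2-70B-style sizes (not taken from the commit), assuming a single-rank run so per-rank and global head counts coincide:

# Illustrative sizes only; world_size == 1 so per-rank head counts equal the config values.
hidden_size = 8192
num_attention_heads = 64       # query heads
num_key_value_heads = 8        # GQA: one k/v head shared by each group of 8 query heads
head_size = hidden_size // num_attention_heads             # 128

qkv_rows = (num_attention_heads + 2 * num_key_value_heads) * head_size
print([qkv_rows, hidden_size])  # [10240, 8192] -- expected shape of the fused QKV weight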

server/text_generation_server/models/paged_causal_lm.py

Lines changed: 3 additions & 0 deletions
@@ -333,6 +333,9 @@ def __init__(
             total_num_gpu_blocks=total_num_gpu_blocks,
         )

+        # log number of free blocks at init
+        print("[PagedKVCacheManager] number of free blocks: %d" % (len(self.kv_cache_manager.free_blocks)))
+
     @property
     def batch_type(self) -> Type[PagedCausalLMBatch]:
         return self._batch_type
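
A small aside, not part of the commit: since this runs once at model initialization, a bare print is workable, but routing it through Python's standard logging module would let deployments control verbosity. A possible equivalent, with a hypothetical logger name and a stand-in value for the free-block count:

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("paged_causal_lm")   # hypothetical logger name

num_free_blocks = 1024  # stand-in for len(self.kv_cache_manager.free_blocks)
logger.info("[PagedKVCacheManager] number of free blocks: %d", num_free_blocks)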

0 commit comments
