
Commit 1b0bd0f

Add Falcon support (new) (#592)
1 parent 20044ca commit 1b0bd0f

16 files changed: +680 -122 lines changed

README.md

Lines changed: 1 addition & 0 deletions

@@ -44,6 +44,7 @@ vLLM seamlessly supports many Huggingface models, including the following architectures:
 - Baichuan-7B (`baichuan-inc/Baichuan-7B`)
 - BLOOM (`bigscience/bloom`, `bigscience/bloomz`, etc.)
+- Falcon (`tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.)
 - GPT-2 (`gpt2`, `gpt2-xl`, etc.)
 - GPT BigCode (`bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, etc.)
 - GPT-J (`EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc.)
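With Falcon added to the supported list, a minimal usage sketch with vLLM's offline LLM API looks like the following (the model name comes from the list above; the prompt and sampling settings are illustrative only):

from vllm import LLM, SamplingParams

# Load one of the newly supported Falcon checkpoints.
llm = LLM(model="tiiuae/falcon-7b")

# Illustrative sampling settings; any SamplingParams work here.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=64)

outputs = llm.generate(["What is the meaning of life?"], sampling_params)
for output in outputs:
    print(output.prompt, output.outputs[0].text)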

csrc/pos_encoding_kernels.cu

Lines changed: 29 additions & 13 deletions

@@ -10,7 +10,8 @@ __global__ void rotary_embedding_neox_kernel(
   scalar_t* __restrict__ key,                 // [num_tokens, num_kv_heads, head_size]
   const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // 2]
   const int rot_dim,
-  const int stride,
+  const int query_stride,
+  const int key_stride,
   const int num_heads,
   const int num_kv_heads,
   const int head_size) {
@@ -23,14 +24,14 @@ __global__ void rotary_embedding_neox_kernel(
   const int nq = num_heads * embed_dim;
   for (int i = threadIdx.x; i < nq; i += blockDim.x) {
     const int head_idx = i / embed_dim;
-    const int token_head = token_idx * stride + head_idx * head_size;
+    const int token_head = token_idx * query_stride + head_idx * head_size;

     const int rot_offset = i % embed_dim;
     const int x_index = rot_offset;
     const int y_index = embed_dim + rot_offset;

-    const int out_x = token_idx * stride + head_idx * head_size + x_index;
-    const int out_y = token_idx * stride + head_idx * head_size + y_index;
+    const int out_x = token_idx * query_stride + head_idx * head_size + x_index;
+    const int out_y = token_idx * query_stride + head_idx * head_size + y_index;

     const scalar_t cos = __ldg(cache_ptr + x_index);
     const scalar_t sin = __ldg(cache_ptr + y_index);
@@ -39,13 +40,27 @@ __global__ void rotary_embedding_neox_kernel(
     const scalar_t q_y = query[token_head + y_index];
     query[out_x] = q_x * cos - q_y * sin;
     query[out_y] = q_y * cos + q_x * sin;
+  }
+
+  const int nk = num_kv_heads * embed_dim;
+  for (int i = threadIdx.x; i < nk; i += blockDim.x) {
+    const int head_idx = i / embed_dim;
+    const int token_head = token_idx * key_stride + head_idx * head_size;
+
+    const int rot_offset = i % embed_dim;
+    const int x_index = rot_offset;
+    const int y_index = embed_dim + rot_offset;
+
+    const int out_x = token_idx * key_stride + head_idx * head_size + x_index;
+    const int out_y = token_idx * key_stride + head_idx * head_size + y_index;
+
+    const scalar_t cos = __ldg(cache_ptr + x_index);
+    const scalar_t sin = __ldg(cache_ptr + y_index);

-    if (head_idx < num_kv_heads) {
-      const scalar_t k_x = key[token_head + x_index];
-      const scalar_t k_y = key[token_head + y_index];
-      key[out_x] = k_x * cos - k_y * sin;
-      key[out_y] = k_y * cos + k_x * sin;
-    }
+    const scalar_t k_x = key[token_head + x_index];
+    const scalar_t k_y = key[token_head + y_index];
+    key[out_x] = k_x * cos - k_y * sin;
+    key[out_y] = k_y * cos + k_x * sin;
   }
 }
@@ -62,8 +77,8 @@ void rotary_embedding_neox(
   int rot_dim = cos_sin_cache.size(1);
   int num_heads = query.size(1) / head_size;
   int num_kv_heads = key.size(1) / head_size;
-  int stride = query.stride(0);
-  TORCH_CHECK(stride == key.stride(0));
+  int query_stride = query.stride(0);
+  int key_stride = key.stride(0);

   dim3 grid(num_tokens);
   dim3 block(std::min(num_heads * rot_dim / 2, 512));
@@ -80,7 +95,8 @@ void rotary_embedding_neox(
       key.data_ptr<scalar_t>(),
       cos_sin_cache.data_ptr<scalar_t>(),
       rot_dim,
-      stride,
+      query_stride,
+      key_stride,
       num_heads,
       num_kv_heads,
       head_size);
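The kernel change lets query and key carry different head counts and strides, rotating each in its own loop. As a reference for the math (not the kernel itself), here is a minimal PyTorch sketch of the same NeoX-style rotation; the function and argument names are mine, and it assumes the per-position cache is laid out as [cos halves | sin halves], which is what the x_index/y_index arithmetic above implies:

import torch

def rotary_neox_reference(positions: torch.Tensor,     # [num_tokens]
                          query: torch.Tensor,          # [num_tokens, num_heads, head_size]
                          key: torch.Tensor,            # [num_tokens, num_kv_heads, head_size]
                          cos_sin_cache: torch.Tensor,  # [max_position, rot_dim], cos then sin
                          rot_dim: int):
    half = rot_dim // 2
    cos, sin = cos_sin_cache[positions].split(half, dim=-1)
    cos = cos.unsqueeze(1)  # broadcast over the head dimension, whatever its size
    sin = sin.unsqueeze(1)

    def rotate(t: torch.Tensor) -> torch.Tensor:
        x, y = t[..., :half], t[..., half:rot_dim]
        out = t.clone()
        out[..., :half] = x * cos - y * sin
        out[..., half:rot_dim] = y * cos + x * sin
        return out

    # Query and key are rotated independently, so num_heads and num_kv_heads
    # are free to differ (multi-query / grouped-query attention).
    return rotate(query), rotate(key)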

docs/source/models/supported_models.rst

Lines changed: 3 additions & 0 deletions

@@ -20,6 +20,9 @@ Alongside each architecture, we include some popular models that use it.
   * - :code:`BloomForCausalLM`
     - BLOOM, BLOOMZ, BLOOMChat
     - :code:`bigscience/bloom`, :code:`bigscience/bloomz`, etc.
+  * - :code:`FalconForCausalLM`
+    - Falcon
+    - :code:`tiiuae/falcon-7b`, :code:`tiiuae/falcon-40b`, :code:`tiiuae/falcon-rw-7b`, etc.
   * - :code:`GPT2LMHeadModel`
     - GPT-2
     - :code:`gpt2`, :code:`gpt2-xl`, etc.

examples/llm_engine_example.py

Lines changed: 2 additions & 1 deletion

@@ -10,7 +10,8 @@ def main(args: argparse.Namespace):

     # Test the following prompts.
     test_prompts = [
-        ("A robot may not injure a human being", SamplingParams()),
+        ("A robot may not injure a human being",
+         SamplingParams(temperature=0.0)),
         ("To be or not to be,",
          SamplingParams(temperature=0.8, top_k=5, presence_penalty=0.2)),
         ("What is the meaning of life?",

vllm/config.py

Lines changed: 7 additions & 2 deletions

@@ -94,8 +94,13 @@ def get_head_size(self) -> int:
         return self.hf_config.hidden_size // self.hf_config.num_attention_heads

     def get_num_heads(self, parallel_config: "ParallelConfig") -> int:
-        # For GPTBigCode:
-        if getattr(self.hf_config, "multi_query", False):
+        # For GPTBigCode & Falcon:
+        # Note: for falcon, when new_decoder_architecture is True, the
+        # multi_query flag is ignored and we use n_head_kv for the number of
+        # KV heads.
+        if (getattr(self.hf_config, "multi_query", False) and
+                not (self.hf_config.model_type == "falcon" and
+                     getattr(self.hf_config, "new_decoder_architecture", False))):
             # Multi-query attention, only one KV head.
             return 1
         # For Falcon:
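To make the branching concrete, here is a hedged sketch of how the three Falcon families are expected to resolve under this rule. The dict fields mirror HF config attributes, but the exact values are illustrative rather than copied from the checkpoints' configs, and the helper function is mine, not vLLM code:

def expected_num_kv_heads(cfg: dict) -> int:
    # Mirrors the branching in get_num_heads() above, over a plain dict.
    if cfg.get("multi_query", False) and not (
            cfg.get("model_type") == "falcon"
            and cfg.get("new_decoder_architecture", False)):
        return 1                            # multi-query attention: one shared KV head
    if cfg.get("n_head_kv") is not None:
        return cfg["n_head_kv"]             # new decoder architecture: grouped KV heads
    return cfg["num_attention_heads"]       # plain multi-head attention

falcon_7b = {"model_type": "falcon", "multi_query": True,
             "num_attention_heads": 71}                     # -> 1
falcon_40b = {"model_type": "falcon", "new_decoder_architecture": True,
              "n_head_kv": 8, "num_attention_heads": 128}   # -> 8
falcon_rw = {"model_type": "falcon", "multi_query": False,
             "num_attention_heads": 32}                     # -> 32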

vllm/model_executor/layers/attention.py

Lines changed: 24 additions & 12 deletions

@@ -314,14 +314,13 @@ def forward(
 class PagedAttentionWithALiBi(PagedAttention):
     """PagedAttention with ALiBi attention bias."""

-    def __init__(
-        self,
-        num_heads: int,
-        head_size: int,
-        scale: float,
-        slopes: List[float],
-    ) -> None:
-        super().__init__(num_heads, head_size, scale)
+    def __init__(self,
+                 num_heads: int,
+                 head_size: int,
+                 scale: float,
+                 slopes: List[float],
+                 num_kv_heads: Optional[int] = None) -> None:
+        super().__init__(num_heads, head_size, scale, num_kv_heads)
         assert len(slopes) == num_heads

         slopes = torch.tensor(slopes, dtype=torch.float32)
@@ -334,6 +333,11 @@ def set_attn_bias(self, input_metadata: InputMetadata) -> None:
         # Generates ALiBi mask for each prompt.
         for prompt_len in input_metadata.prompt_lens:
             bias = torch.arange(prompt_len)
+            # Note(zhuohan): HF uses
+            #     `bias = bias[None, :].repeat(prompt_len, 1)`
+            # here. We find that both biases give the same results, but
+            # the bias below more accurately follows the original ALiBi
+            # paper.
             bias = bias[None, :] - bias[:, None]
             bias = bias.to(self.alibi_slopes.device)
@@ -363,10 +367,17 @@ def multi_query_kv_attention(
         Args:
             output: shape = [num_prompt_tokens, num_heads, head_size]
             query: shape = [num_prompt_tokens, num_heads, head_size]
-            key: shape = [num_prompt_tokens, num_heads, head_size]
-            value: shape = [num_prompt_tokens, num_heads, head_size]
+            key: shape = [num_prompt_tokens, num_kv_heads, head_size]
+            value: shape = [num_prompt_tokens, num_kv_heads, head_size]
             input_metadata: metadata for paged attention.
         """
+        if self.num_kv_heads != self.num_heads:
+            # Project the key and value tensors to the desired number of heads.
+            key = torch.repeat_interleave(key, self.num_queries_per_kv, dim=1)
+            value = torch.repeat_interleave(value,
+                                            self.num_queries_per_kv,
+                                            dim=1)
+
         # FIXME(woosuk): Because xformers does not support dynamic sequence
         # lengths with custom attention bias, we process each prompt one by
         # one. This is inefficient, especially when we have many short prompts.
@@ -400,9 +411,10 @@ def single_query_cached_kv_attention(
         Args:
             output: shape = [num_generation_tokens, num_heads, head_size]
             query: shape = [num_generation_tokens, num_heads, head_size]
-            key_cache: shape = [num_blocks, num_heads, head_size/x,
+            key_cache: shape = [num_blocks, num_kv_heads, head_size/x,
                 block_size, x]
-            value_cache: shape = [num_blocks, num_heads, head_size, block_size]
+            value_cache: shape = [num_blocks, num_kv_heads, head_size,
+                block_size]
             input_metadata: metadata for paged attention.
         """
         block_size = value_cache.shape[3]
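The key change for Falcon here is the grouped-KV handling in multi_query_kv_attention: when num_kv_heads is smaller than num_heads, each KV head is duplicated for its group of query heads before the usual attention math. A minimal PyTorch sketch with toy shapes (no causal mask or ALiBi bias, and plain einsum attention rather than the xformers path vLLM actually uses):

import torch

# Toy shapes: 3 prompt tokens, 8 query heads sharing 2 KV heads, head_size 4.
num_tokens, num_heads, num_kv_heads, head_size = 3, 8, 2, 4
num_queries_per_kv = num_heads // num_kv_heads   # 4 query heads per KV head

query = torch.randn(num_tokens, num_heads, head_size)
key = torch.randn(num_tokens, num_kv_heads, head_size)
value = torch.randn(num_tokens, num_kv_heads, head_size)

# Duplicate each KV head for its group of query heads: expanded heads 0-3 all
# equal KV head 0, heads 4-7 all equal KV head 1.
key = torch.repeat_interleave(key, num_queries_per_kv, dim=1)
value = torch.repeat_interleave(value, num_queries_per_kv, dim=1)
assert key.shape == (num_tokens, num_heads, head_size)

# After the expansion, ordinary multi-head attention applies unchanged.
scores = torch.einsum("qhd,khd->hqk", query, key) / head_size ** 0.5
attn = torch.softmax(scores, dim=-1)
out = torch.einsum("hqk,khd->qhd", attn, value)   # [num_tokens, num_heads, head_size]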

vllm/model_executor/model_loader.py

Lines changed: 2 additions & 0 deletions

@@ -14,6 +14,7 @@
     "BaiChuanForCausalLM": BaiChuanForCausalLM,  # baichuan-7b
     "BaichuanForCausalLM": BaichuanForCausalLM,  # baichuan-13b
     "BloomForCausalLM": BloomForCausalLM,
+    "FalconForCausalLM": FalconForCausalLM,
     "GPT2LMHeadModel": GPT2LMHeadModel,
     "GPTBigCodeForCausalLM": GPTBigCodeForCausalLM,
     "GPTJForCausalLM": GPTJForCausalLM,
@@ -22,6 +23,7 @@
     "LLaMAForCausalLM": LlamaForCausalLM,  # For decapoda-research/llama-*
     "MPTForCausalLM": MPTForCausalLM,
     "OPTForCausalLM": OPTForCausalLM,
+    "RWForCausalLM": FalconForCausalLM,
 }
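Two registry entries are added so that both the transformers-native architecture name (FalconForCausalLM) and the custom-code name used by the original Hub checkpoints (RWForCausalLM) resolve to the same vLLM implementation. A hedged sketch of how such a registry is typically consulted; the registry name, helper name, and error message here are illustrative, not necessarily the file's actual identifiers:

from transformers import AutoConfig

def resolve_model_class(model_name: str):
    # config.architectures comes from the checkpoint's config.json, e.g.
    # ["RWForCausalLM"] for the original tiiuae/falcon-* uploads.
    config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
    for arch in getattr(config, "architectures", None) or []:
        if arch in _MODEL_REGISTRY:   # assuming the dict in the diff is named _MODEL_REGISTRY
            return _MODEL_REGISTRY[arch]
    raise ValueError(f"Model architectures {config.architectures} are not supported.")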

vllm/model_executor/models/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -1,5 +1,6 @@
 from vllm.model_executor.models.baichuan import BaiChuanForCausalLM, BaichuanForCausalLM
 from vllm.model_executor.models.bloom import BloomForCausalLM
+from vllm.model_executor.models.falcon import FalconForCausalLM
 from vllm.model_executor.models.gpt2 import GPT2LMHeadModel
 from vllm.model_executor.models.gpt_bigcode import GPTBigCodeForCausalLM
 from vllm.model_executor.models.gpt_j import GPTJForCausalLM
@@ -12,6 +13,7 @@
     "BaiChuanForCausalLM",
     "BaichuanForCausalLM",
     "BloomForCausalLM",
+    "FalconForCausalLM",
     "GPT2LMHeadModel",
     "GPTBigCodeForCausalLM",
     "GPTJForCausalLM",
