
Commit 898285c

fix: CUDA error when inferencing with Falcon-40B base model (#992)

1 parent: a62de9e

1 file changed (+2, -1 lines)

vllm/config.py

Lines changed: 2 additions & 1 deletion
@@ -114,8 +114,9 @@ def get_num_heads(self, parallel_config: "ParallelConfig") -> int:
         # Note: for falcon, when new_decoder_architecture is True, the
         # multi_query flag is ignored and we use n_head_kv for the number of
         # KV heads.
+        falcon_model_types = ["falcon", "RefinedWeb", "RefinedWebModel"]
         new_decoder_arch_falcon = (
-            self.hf_config.model_type == "falcon"
+            self.hf_config.model_type in falcon_model_types
             and getattr(self.hf_config, "new_decoder_architecture", False))
         if not new_decoder_arch_falcon and getattr(self.hf_config,
                                                    "multi_query", False):
