
Commit 4b5bcf8

faster startup of vLLM (#982)

Authored by ri938 and robirv938
Co-authored-by: Robert Irvine <[email protected]>

1 parent 852ef5b

File tree

1 file changed: +3 −2 lines changed


vllm/model_executor/layers/attention.py

Lines changed: 3 additions & 2 deletions
@@ -259,8 +259,9 @@ def __init__(
         self.is_neox_style = is_neox_style

         # Create the cos and sin cache.
-        inv_freq = 1.0 / (base**(torch.arange(0, rotary_dim, 2) / rotary_dim))
-        t = torch.arange(max_position).float()
+        inv_freq = 1.0 / (base**(
+            torch.arange(0, rotary_dim, 2, device="cuda") / rotary_dim))
+        t = torch.arange(max_position, device="cuda").float()
         freqs = torch.einsum("i,j -> ij", t, inv_freq.float())
         cos = freqs.cos()
         sin = freqs.sin()
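The change creates the rotary position-embedding cache tensors directly on the GPU (`device="cuda"`) rather than building them on the CPU and moving them over, which avoids a host-to-device copy at startup. The cached values themselves are just the cosine and sine of an outer product of positions and inverse frequencies. As a rough illustration of what the cache holds (not vLLM code; the helper `rotary_cache` is hypothetical and uses plain Python so it runs without PyTorch):

```python
import math

def rotary_cache(max_position, rotary_dim, base=10000.0):
    # inv_freq[j] = 1 / base^(2j / rotary_dim), one entry per pair of dims
    inv_freq = [base ** (-(2 * j) / rotary_dim) for j in range(rotary_dim // 2)]
    # Outer product of positions t and inverse frequencies, then cos/sin,
    # mirroring torch.einsum("i,j -> ij", t, inv_freq) followed by .cos()/.sin()
    cos = [[math.cos(t * f) for f in inv_freq] for t in range(max_position)]
    sin = [[math.sin(t * f) for f in inv_freq] for t in range(max_position)]
    return cos, sin

cos, sin = rotary_cache(max_position=4, rotary_dim=8)
```

In the actual commit the same tables are built as `torch` tensors; the only behavioral difference introduced here is where they are allocated.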

0 commit comments
