Commit 2e37745

tjohnson31415 and njhill committed
bump: update to PyTorch 2.2 and Flash 2.5.2
Co-authored-by: Nick Hill <[email protected]>
1 parent 52170da commit 2e37745

6 files changed (+24 −47 lines)


Dockerfile

Lines changed: 4 additions & 4 deletions
@@ -1,9 +1,9 @@
 ## Global Args #################################################################
 ARG BASE_UBI_IMAGE_TAG=9.3-1552
 ARG PROTOC_VERSION=25.2
-#ARG PYTORCH_INDEX="https://download.pytorch.org/whl"
-ARG PYTORCH_INDEX="https://download.pytorch.org/whl/nightly"
-ARG PYTORCH_VERSION=2.3.0.dev20240125
+ARG PYTORCH_INDEX="https://download.pytorch.org/whl"
+# ARG PYTORCH_INDEX="https://download.pytorch.org/whl/nightly"
+ARG PYTORCH_VERSION=2.2.0
 ARG PYTHON_VERSION=3.11
 
 ## Base Layer ##################################################################
@@ -205,7 +205,7 @@ RUN pip install torch==$PYTORCH_VERSION+cu118 --index-url "${PYTORCH_INDEX}/cu11
 
 ## Build flash attention v2 ####################################################
 FROM python-builder as flash-att-v2-builder
-ARG FLASH_ATT_VERSION=v2.3.6
+ARG FLASH_ATT_VERSION=v2.5.2
 
 WORKDIR /usr/src/flash-attention-v2
 
server/text_generation_server/models/custom_modeling/flash_llama_modeling.py

Lines changed: 3 additions & 8 deletions
@@ -243,20 +243,16 @@ def forward(
         self.rotary_emb(query, cos, sin)
         self.rotary_emb(torch.select(kv, dim=1, index=0), cos, sin)
 
-        # output tensor
-        attn_output = torch.empty_like(query)
-
         # Prefill
         if layer_past_present_indices is None:
             # Copy to layer past
             layer_past[...] = kv
 
             # flash attention
-            attention(
+            attn_output = attention(
                 query,
                 torch.select(kv, dim=1, index=0),
                 torch.select(kv, dim=1, index=1),
-                attn_output,
                 cu_seqlens,
                 max_s,
                 self.softmax_scale,
@@ -267,11 +263,10 @@ def forward(
             layer_past[layer_past_present_indices] = kv
 
             # flash attention
-            attention(
+            attn_output = attention(
                 query,
                 layer_past[:, 0],
                 layer_past[:, 1],
-                attn_output,
                 cu_seqlens,
                 max_s,
                 self.softmax_scale,
@@ -280,7 +275,7 @@ def forward(
                 False,
             )
 
-        return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size))
+        return self.o_proj(attn_output.reshape(-1, self.num_heads * self.head_size))
 
 
 class LlamaMLP(nn.Module):
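
In each modeling file the caller no longer preallocates an output tensor with torch.empty_like(query); the updated attention() wrapper allocates the output itself and returns it. Because the tensor handed back this way is not guaranteed to be contiguous, the final projection switches from .view() to .reshape(). A minimal, CPU-only illustration of that distinction (shapes are invented for the example; nothing below comes from the model code):

    import torch

    # Stand-in for an attention output whose memory layout the caller does not control.
    x = torch.randn(4, 8, 16).transpose(0, 1)   # shape (8, 4, 16), non-contiguous

    try:
        x.view(-1, 4 * 16)                      # view() requires a compatible memory layout
    except RuntimeError as err:
        print("view() failed:", err)

    y = x.reshape(-1, 4 * 16)                   # reshape() falls back to a copy when needed
    print(y.shape)                              # torch.Size([8, 64])
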

server/text_generation_server/models/custom_modeling/flash_neox_modeling.py

Lines changed: 3 additions & 8 deletions
@@ -135,20 +135,16 @@ def forward(
 
         query = qkv[:, 0]
 
-        # output tensor
-        attn_output = torch.empty_like(query)
-
         # Prefill
         if layer_past_present_indices is None:
             # Copy to layer past
             layer_past[...] = qkv[:, 1:]
 
             # flash attention
-            attention(
+            attn_output = attention(
                 query,
                 qkv[:, 1],
                 qkv[:, 2],
-                attn_output,
                 cu_seqlens,
                 max_s,
                 self.softmax_scale,
@@ -159,11 +155,10 @@ def forward(
             layer_past[layer_past_present_indices] = qkv[:, 1:]
 
             # flash attention
-            attention(
+            attn_output = attention(
                 query,
                 layer_past[:, 0],
                 layer_past[:, 1],
-                attn_output,
                 cu_seqlens,
                 max_s,
                 self.softmax_scale,
@@ -172,7 +167,7 @@ def forward(
                 False,
             )
 
-        return self.dense(attn_output.view(-1, self.num_heads * self.head_size))
+        return self.dense(attn_output.reshape(-1, self.num_heads * self.head_size))
 
 
 class FlashMLP(nn.Module):

server/text_generation_server/models/custom_modeling/flash_rw_modeling.py

Lines changed: 5 additions & 15 deletions
@@ -175,20 +175,16 @@ def forward(
         self.rotary_emb(query, cos, sin)
         self.rotary_emb(kv[:, 0], cos, sin)
 
-        # output
-        attn_output = torch.empty_like(query)
-
         # Prefill
         if layer_past_present_indices is None:
             # Copy to layer past
             layer_past[...] = kv
 
             # flash attention
-            attention(
+            attn_output = attention(
                 query,
                 torch.select(kv, dim=1, index=0),
                 torch.select(kv, dim=1, index=1),
-                attn_output,
                 cu_seqlens,
                 max_s,
                 self.softmax_scale,
@@ -199,11 +195,10 @@ def forward(
             layer_past[layer_past_present_indices] = kv
 
             # flash attention
-            attention(
+            attn_output = attention(
                 query,
                 layer_past[:, 0],
                 layer_past[:, 1],
-                attn_output,
                 cu_seqlens,
                 max_s,
                 self.softmax_scale,
@@ -212,7 +207,7 @@ def forward(
                 False,
             )
 
-        return self.dense(attn_output.view(-1, self.num_heads * self.head_size))
+        return self.dense(attn_output.reshape(-1, self.num_heads * self.head_size))
 
 
 class FlashRWLargeAttention(torch.nn.Module):
@@ -286,20 +281,16 @@ def forward(
         self.rotary_emb(query, cos, sin)
         self.rotary_emb(kv[:, :, 0], cos, sin)
 
-        # output
-        attn_output = torch.empty_like(query)
-
         # Prefill
         if layer_past_present_indices is None:
             # Copy to layer past
             layer_past[...] = kv
 
             # flash attention
-            attention(
+            attn_output = attention(
                 query,
                 torch.select(kv, dim=2, index=0),
                 torch.select(kv, dim=2, index=1),
-                attn_output,
                 cu_seqlens,
                 max_s,
                 self.softmax_scale,
@@ -310,11 +301,10 @@ def forward(
             layer_past[layer_past_present_indices] = kv
 
             # flash attention
-            attention(
+            attn_output = attention(
                 query,
                 layer_past[:, :, 0],
                 layer_past[:, :, 1],
-                attn_output,
                 cu_seqlens,
                 max_s,
                 self.softmax_scale,
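
The same call-site change is applied to both Falcon attention classes; the only difference visible here is how K and V are pulled out of the packed kv tensor (dim=1 for FlashRWAttention, dim=2 for FlashRWLargeAttention, matching how each class lays out kv). A small, hedged illustration of that torch.select pattern with made-up shapes (the real head and group counts come from the model config):

    import torch

    num_tokens, num_kv, head_size = 6, 4, 64

    # FlashRWAttention-style packing: (tokens, 2, kv_heads, head_size)
    kv = torch.randn(num_tokens, 2, num_kv, head_size)
    k = torch.select(kv, dim=1, index=0)             # (tokens, kv_heads, head_size)
    v = torch.select(kv, dim=1, index=1)

    # FlashRWLargeAttention-style packing: (tokens, groups, 2, head_size)
    kv_grouped = torch.randn(num_tokens, num_kv, 2, head_size)
    k_g = torch.select(kv_grouped, dim=2, index=0)   # (tokens, groups, head_size)
    v_g = torch.select(kv_grouped, dim=2, index=1)

    print(k.shape, k_g.shape)
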

server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py

Lines changed: 3 additions & 8 deletions
@@ -243,20 +243,16 @@ def forward(
         query = query.view(-1, self.num_heads, self.head_size)
         key_value = key_value.view(-1, 2, 1, self.head_size)
 
-        # output
-        attn_output = torch.empty_like(query)
-
         # Prefill
         if layer_past_present_indices is None:
             # Copy to layer past
             layer_past[...] = key_value
 
             # flash attention
-            attention(
+            attn_output = attention(
                 query,
                 torch.select(key_value, dim=1, index=0),
                 torch.select(key_value, dim=1, index=1),
-                attn_output,
                 cu_seqlens,
                 max_s,
                 self.softmax_scale,
@@ -267,11 +263,10 @@ def forward(
             layer_past[layer_past_present_indices] = key_value
 
             # flash attention
-            attention(
+            attn_output = attention(
                 query,
                 layer_past[:, 0],
                 layer_past[:, 1],
-                attn_output,
                 cu_seqlens,
                 max_s,
                 self.softmax_scale,
@@ -280,7 +275,7 @@ def forward(
                 False,
             )
 
-        return self.c_proj(attn_output.view(-1, self.num_heads * self.head_size))
+        return self.c_proj(attn_output.reshape(-1, self.num_heads * self.head_size))
 
 
 class MLP(nn.Module):

server/text_generation_server/utils/flash_attn.py

Lines changed: 6 additions & 4 deletions
@@ -44,7 +44,6 @@ def attention(
     q,
     k,
     v,
-    out,
     cu_seqlens,
     max_s,
     softmax_scale,
@@ -61,10 +60,11 @@ def attention(
             q,
             k,
             v,
-            out,
+            None,
             cu_seqlens_q,
             cu_seqlens,
             None,
+            None,
             max_s_q,
             max_s,
             0.0,
@@ -75,7 +75,7 @@ def attention(
             -1,
             False,
             None,
-        )
+        )[0]
 
     if HAS_FLASH_ATTN:
         # Flash attention v1 requires q, k and v to have the same number of heads
@@ -104,7 +104,8 @@ def attention(
             .reshape(original_shape[0], -1, original_shape[2])
         )
 
-    return flash_attn_cuda.fwd(
+    out = torch.empty_like(q)
+    flash_attn_cuda.fwd(
         q,
         k,
         v,
@@ -121,5 +122,6 @@ def attention(
         0,
         None,
     )
+    return out
 
 raise NotImplementedError("flash attention is not installed")
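
With this change the wrapper no longer takes an out tensor at all: on the flash-attention v2 path it passes None for the output buffer, threads an additional None through the expanded varlen_fwd argument list, and returns the first element of the tuple varlen_fwd gives back; on the v1 path it allocates the buffer internally and returns it. That "allocate and return" contract is the same one exposed by flash-attn's public Python API. The sketch below is a hedged usage example against that public API (flash_attn_varlen_func) rather than the private varlen_fwd binding; it assumes a CUDA device with flash-attn 2.x installed, and the shapes are illustrative only:

    import torch
    from flash_attn import flash_attn_varlen_func

    total_tokens, num_heads, head_size = 32, 8, 64
    q = torch.randn(total_tokens, num_heads, head_size, device="cuda", dtype=torch.float16)
    k = torch.randn_like(q)
    v = torch.randn_like(q)

    # Two packed sequences of 16 tokens each (no padding), as in the varlen layout.
    cu_seqlens = torch.tensor([0, 16, 32], dtype=torch.int32, device="cuda")

    out = flash_attn_varlen_func(
        q, k, v,
        cu_seqlens_q=cu_seqlens,
        cu_seqlens_k=cu_seqlens,
        max_seqlen_q=16,
        max_seqlen_k=16,
        softmax_scale=head_size ** -0.5,
        causal=True,
    )
    print(out.shape)   # (total_tokens, num_heads, head_size) -- no preallocated output
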
