
Commit c1c58f2

tjohnson31415 authored and njhill committed
feat: support setting the attention impl in hf_transformers
Signed-off-by: Travis Johnson <[email protected]>
1 parent: cfa10e3


1 file changed: +4 −0


server/text_generation_server/inference_engine/hf_transformers.py

Lines changed: 4 additions & 0 deletions
@@ -24,6 +24,10 @@ def __init__(
             "trust_remote_code": TRUST_REMOTE_CODE,
         }
 
+        # TODO: consider if Flash Attention should be enabled based on FLASH_ATTENTION=True
+        if attn_impl := os.getenv("TRANSFORMERS_ATTN_IMPL"):
+            kwargs["attn_implementation"] = attn_impl
+
         if model_config.model_type == "mpt":
             model_config.init_device = str(self.device)
             kwargs["config"] = model_config
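For context, here is a minimal, self-contained sketch of what this change does at model-load time. It is an illustration, not this repository's code: the model name ("gpt2"), the simplified kwargs dict, and the surrounding setup are assumptions, and only the environment-variable check mirrors the committed lines. Hugging Face transformers does accept an attn_implementation keyword in from_pretrained, with values such as "eager", "sdpa", or "flash_attention_2" (the last requires the flash-attn package to be installed).

import os

from transformers import AutoModelForCausalLM

# Sketch only: mirror the committed env-var check. The walrus operator
# assigns and tests in one step, so the kwarg is set only when
# TRANSFORMERS_ATTN_IMPL is defined and non-empty; otherwise the
# library's default attention implementation is used.
kwargs = {}
if attn_impl := os.getenv("TRANSFORMERS_ATTN_IMPL"):
    kwargs["attn_implementation"] = attn_impl

# "gpt2" is a placeholder model name used here for illustration.
model = AutoModelForCausalLM.from_pretrained("gpt2", **kwargs)

With this in place, launching the server with, for example, TRANSFORMERS_ATTN_IMPL=flash_attention_2 selects that attention backend, while leaving the variable unset preserves the existing default behavior.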
