1 parent cfa10e3 commit c1c58f2
server/text_generation_server/inference_engine/hf_transformers.py
@@ -24,6 +24,10 @@ def __init__(
            "trust_remote_code": TRUST_REMOTE_CODE,
        }

+       # TODO: consider if Flash Attention should be enabled based on FLASH_ATTENTION=True
+       if attn_impl := os.getenv("TRANSFORMERS_ATTN_IMPL"):
+           kwargs["attn_implementation"] = attn_impl
+
        if model_config.model_type == "mpt":
            model_config.init_device = str(self.device)
        kwargs["config"] = model_config
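For context, a minimal sketch of the pattern this hunk introduces, assuming the kwargs are eventually forwarded to AutoModelForCausalLM.from_pretrained (that call, the TRUST_REMOTE_CODE default, and the model name below are illustrative assumptions, not taken from this commit):

import os

from transformers import AutoModelForCausalLM

# Assumption: TRUST_REMOTE_CODE is derived from an env var elsewhere in the server.
TRUST_REMOTE_CODE = os.getenv("TRUST_REMOTE_CODE", "false").lower() == "true"

kwargs = {
    "trust_remote_code": TRUST_REMOTE_CODE,
}

# The change above: honor TRANSFORMERS_ATTN_IMPL when set, e.g.
#   TRANSFORMERS_ATTN_IMPL=flash_attention_2   (or "sdpa" / "eager")
if attn_impl := os.getenv("TRANSFORMERS_ATTN_IMPL"):
    kwargs["attn_implementation"] = attn_impl

# Hypothetical model name for illustration; the real engine resolves the model elsewhere.
model = AutoModelForCausalLM.from_pretrained("gpt2", **kwargs)

With this in place, launching the server with TRANSFORMERS_ATTN_IMPL=flash_attention_2 set in the environment would opt the loaded model into Flash Attention 2 (provided the flash-attn package is installed), while leaving the variable unset preserves the library default.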