
Commit debf131

Jackmin801 authored and samsja committed
fix typos
1 parent 5e23acd commit debf131

3 files changed: +3 −3 lines changed


CONTRIBUTING.md

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 # Development workflow

-This is the develpment workflow of prime intellect to build upon hivemind
+This is the development workflow of prime intellect to build upon hivemind

 ## Install dependencies

open_diloco/train_fsdp.py

Lines changed: 1 addition & 1 deletion
@@ -116,7 +116,7 @@ def cast_str_to_list(cls, values: dict[str, Any]) -> dict[str, Any]:
 class Config(BaseConfig):
     path_model: str = "PrimeIntellect/llama-150m-fresh"
     torch_compile: bool = True
-    attn_implementation: str = "flash_attention_2"
+    attn_implementation: str = "sdpa"
     # Data
     dataset_name_or_path: str = "allenai/c4"
     seq_length: int = 1024
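For context, `attn_implementation` is the flag that Hugging Face `transformers` accepts when loading a model, so this change swaps the default from FlashAttention-2 to PyTorch's built-in scaled dot-product attention and drops the hard dependency on the `flash-attn` package. Below is a minimal sketch of how such a config value is typically consumed; the model name comes from the diff above, but the loading call itself is an assumption, not code from this commit:

```python
from transformers import AutoModelForCausalLM

# Assumption: train_fsdp.py forwards Config.attn_implementation to the
# Hugging Face loader. "sdpa" routes attention through
# torch.nn.functional.scaled_dot_product_attention, which ships with PyTorch;
# "flash_attention_2" additionally requires the flash-attn package.
model = AutoModelForCausalLM.from_pretrained(
    "PrimeIntellect/llama-150m-fresh",
    attn_implementation="sdpa",
)
```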

open_diloco/utils.py

Lines changed: 1 addition & 1 deletion
@@ -117,7 +117,7 @@ def get_compression_kwargs(hivemind_compression: str) -> dict:

 def found_inf_grad(optimizer: torch.optim.Optimizer, scaler: torch.cuda.amp.GradScaler) -> bool:
     """
-    this function check if the scaler has found inf grad for the optimizer. It does by looking up the optimzier state
+    this function check if the scaler has found inf grad for the optimizer. It does by looking up the optimizer state
     regsited inside the scaler. Code is mostly copied/inspired by the torch GradScaler codebase.
     """
     if not scaler._enabled:
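The docstring says the check works by looking up the optimizer state registered inside the scaler. A minimal sketch of how that lookup can be done, assuming GradScaler's private `_per_optimizer_states` bookkeeping; this mirrors PyTorch internals and is not the committed function body:

```python
import torch

def found_inf_grad_sketch(optimizer: torch.optim.Optimizer,
                          scaler: torch.cuda.amp.GradScaler) -> bool:
    # With scaling disabled there is no inf/NaN bookkeeping to consult.
    if not scaler._enabled:
        return False
    # Assumption: GradScaler keys per-optimizer state by id(optimizer); after
    # scaler.unscale_(optimizer), "found_inf_per_device" maps each device to a
    # tensor that is nonzero when an inf/NaN gradient was detected.
    state = scaler._per_optimizer_states[id(optimizer)]
    return sum(found.item() for found in state["found_inf_per_device"].values()) > 0
```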
