diff --git a/lm_eval/models/huggingface.py b/lm_eval/models/huggingface.py
index 16af71806ad..07566bfb28e 100644
--- a/lm_eval/models/huggingface.py
+++ b/lm_eval/models/huggingface.py
@@ -1,7 +1,9 @@
+import os
 import math
 import torch
 import torch.nn.functional as F
 import transformers
+import deepspeed
 from typing import List, Mapping, NewType, Optional, Tuple, Union
 from tqdm import tqdm
 
@@ -77,6 +79,7 @@ def __init__(
         offload_folder: Optional[str] = "./offload",
         dtype: Optional[Union[str, torch.dtype]] = None,
         device: Optional[Union[int, str]] = "cuda",
+        use_deepspeed: Optional[bool] = False
     ):
         """Initializes a HuggingFace `AutoModel` and `AutoTokenizer` for evaluation.
 
@@ -147,6 +150,7 @@ def __init__(
         self._batch_size = batch_size  # TODO: Adaptive batch size
         self._max_gen_toks = max_gen_toks
         self._max_length = max_length
+        self.use_deepspeed = use_deepspeed
         self._config = self.AUTO_CONFIG_CLASS.from_pretrained(
             pretrained,
             revision=revision + ("/" + subfolder if subfolder is not None else ""),
@@ -185,7 +189,7 @@ def __init__(
             # the user specified one so we force `self._device` to be the same as
             # `lm_head`'s.
             self._device = self.model.hf_device_map["lm_head"]
-        if not use_accelerate:
+        if not use_accelerate and not use_deepspeed:
             self.model.to(self._device)
 
     def _create_auto_model(
@@ -199,15 +203,38 @@ def _create_auto_model(
         offload_folder: Optional[str] = None,
         torch_dtype: Optional[Union[str, torch.dtype]] = None,
     ) -> transformers.AutoModel:
-        """Returns a pre-trained pytorch model from a pre-trained model configuration."""
-        model = self.AUTO_MODEL_CLASS.from_pretrained(
-            pretrained,
-            revision=revision + ("/" + subfolder if subfolder is not None else ""),
-            device_map=device_map,
-            max_memory=max_memory,
-            offload_folder=offload_folder,
-            torch_dtype=torch_dtype,
-        )
+        if self.use_deepspeed:
+            # Deepspeed iniatilization
+            world_size = int(os.getenv("WORLD_SIZE", "1"))
+            deepspeed.init_distributed("nccl")
+
+            with deepspeed.OnDevice(dtype=torch.float16, device="meta"):
+                model = self.AUTO_MODEL_CLASS.from_config(self._config, torch_dtype=torch.bfloat16)
+
+            model = model.eval()
+            checkpoints_json = os.path.join(pretrained, "ds_inference_config.json")
+            tp_config = deepspeed.inference.config.DeepSpeedTPConfig()
+            tp_config.tp_size = world_size
+
+            model = deepspeed.init_inference(
+                model,
+                tensor_parallel=tp_config,
+                base_dir=pretrained,
+                dtype=torch_dtype,
+                checkpoint=checkpoints_json,
+                replace_with_kernel_inject=True,
+
+            )
+        else:
+            """Returns a pre-trained pytorch model from a pre-trained model configuration."""
+            model = self.AUTO_MODEL_CLASS.from_pretrained(
+                pretrained,
+                revision=revision + ("/" + subfolder if subfolder is not None else ""),
+                device_map=device_map,
+                max_memory=max_memory,
+                offload_folder=offload_folder,
+                torch_dtype=torch_dtype,
+            )
         return model
 
     def _create_auto_tokenizer(
diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py
index 03d24e52d82..b9c9773db9d 100644
--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -35,6 +35,7 @@
 from . import tydiqa
 from . import wino_bias
 from . import wmt
+from . import xnli
 from . import xquad
 
 
@@ -147,6 +148,9 @@
     # Bias-Shades
     # Format: `bias_shades_{lang}`
     **bias_shades.construct_tasks(),
+    # XNLI
+    # Format: `xnli_{lang}`
+    **xnli.construct_tasks(),
     # BLiMP
     "blimp_adjunct_island": blimp.BlimpAdjunctIsland,
     "blimp_anaphor_gender_agreement": blimp.BlimpAnaphorGenderAgreement,
diff --git a/lm_eval/tasks/xnli.py b/lm_eval/tasks/xnli.py
new file mode 100644
index 00000000000..e54139573da
--- /dev/null
+++ b/lm_eval/tasks/xnli.py
@@ -0,0 +1,74 @@
+"""
+XNLI is an evaluation corpus for language transfer and cross-lingual sentence classification in 15 languages.
+https://arxiv.org/abs/1809.05053
+Homepage: None, Repo: https://github.com/facebookresearch/XNLI
+"""
+import typing
+
+from lm_eval.api.task import PromptSourceTask
+
+
+_CITATION = """
+@inproceedings{conneau2018xnli,
+  title={XNLI: Evaluating Cross-lingual Sentence Representations},
+  author={Conneau, Alexis and Rinott, Ruty and Lample, Guillaume and Williams, Adina and Bowman, Samuel and Schwenk, Holger and Stoyanov, Veselin},
+  booktitle={Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing},
+  pages={2475--2485},
+  year={2018}
+}
+"""
+
+
+class XNLI(PromptSourceTask):
+    """Base XNLI task; language subclasses override `DATASET_NAME`."""
+
+    VERSION = 1
+    DATASET_PATH = "xnli"
+    DATASET_NAME = None
+
+    def has_training_docs(self):
+        return True
+
+    def has_validation_docs(self):
+        return True
+
+    def has_test_docs(self):
+        return True
+
+    # Each accessor yields its split when advertised above, otherwise `None`.
+    def training_docs(self):
+        return self.dataset["train"] if self.has_training_docs() else None
+
+    def validation_docs(self):
+        return self.dataset["validation"] if self.has_validation_docs() else None
+
+    def test_docs(self):
+        return self.dataset["test"] if self.has_test_docs() else None
+
+
+class XNLIEn(XNLI):
+    DATASET_NAME = "en"  # language code; becomes the `{lang}` suffix of the task name
+
+
+class XNLIFr(XNLI):
+    DATASET_NAME = "fr"  # language code; becomes the `{lang}` suffix of the task name
+
+
+XNLI_TASKS = [  # task classes that `construct_tasks` turns into registry entries
+    XNLIEn,
+    XNLIFr,
+]
+
+
+def construct_tasks() -> typing.Dict[str, typing.Type[XNLI]]:
+    """
+    Returns a dictionary of task classes keyed by task name, for example:
+        "xnli_en"
+    will dispatch to the `XNLIEn` class.
+    """
+    tasks = {}
+    for task_class in XNLI_TASKS:
+        benchmark = task_class.DATASET_PATH
+        lang = task_class.DATASET_NAME
+        tasks[f"{benchmark}_{lang}"] = task_class
+    return tasks
diff --git a/main.py b/main.py
index ffd8633e710..d02fe5a1013 100644
--- a/main.py
+++ b/main.py
@@ -103,6 +103,13 @@ def parse_args():
         action="store_true",
         help="Whether to cache your model's predictions or not",
     )
+    # Distributed launchers pass `--local_rank`; fall back to the LOCAL_RANK env var if absent.
+    parser.add_argument(
+        "--local_rank",
+        default=int(os.environ.get("LOCAL_RANK", 0)),
+        type=int,
+        help="used by dist launchers"
+    )
     return parser.parse_args()
 
 
@@ -204,9 +211,10 @@ def main():
     with open(f"./outputs/slim{path_separator}{output_path}.json", "w") as f:
         json.dump(agg2slim(results), f, indent=2)
 
-    print(f"\n{evaluator.make_table(results)}")
+    if args.local_rank == 0:
+        print(f"\n{evaluator.make_table(results)}")
 
-    if not args.no_tracking:
+    if not args.no_tracking and args.local_rank == 0:
         emissions_output_path = f"./outputs/emissions{path_separator}{output_path}.csv"
         os.rename("emissions.csv", emissions_output_path)