
Commit eb1789b

Updating eval for lm_eval 0.4 and 0.3
Summary: lm_eval 0.4 broke backward compatibility; this change makes eval work regardless of which version is installed (and still run when lm_eval is absent).

Test Plan:
(on both versions and without lm_eval installed)
python quantize.py --mode int8

(on both versions)
python eval.py --tasks wikitext

wikitext: {'word_perplexity,none': 12.212490471702079, 'word_perplexity_stderr,none': 'N/A', 'byte_perplexity,none': 1.59675331009031, 'byte_perplexity_stderr,none': 'N/A', 'bits_per_byte,none': 0.6751414412399839, 'bits_per_byte_stderr,none': 'N/A', 'alias': 'wikitext'}

For model checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth
wikitext: {'word_perplexity': 12.212490471702079, 'byte_perplexity': 1.59675331009031, 'bits_per_byte': 0.6751414412399839}

ghstack-source-id: f538368
Pull Request resolved: #91
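Note that the two harness versions also report results under different key names, which is why the two wikitext outputs above do not look identical: 0.4 uses keys like 'word_perplexity,none' while 0.3 uses plain 'word_perplexity'. A minimal sketch (a hypothetical helper, not part of this commit) for reading a metric from either format:

    # Hypothetical helper: read a metric from lm_eval 0.3-style or 0.4-style results.
    def get_metric(task_results: dict, name: str):
        # lm_eval 0.4 reports "<metric>,<filter>" keys; 0.3 reports plain metric names.
        for key in (f"{name},none", name):
            if key in task_results:
                return task_results[key]
        raise KeyError(name)

    results = {'word_perplexity,none': 12.212490471702079, 'byte_perplexity,none': 1.59675331009031}
    print(get_metric(results, 'word_perplexity'))  # 12.212490471702079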
Parent: 8608fd3 · Commit: eb1789b

File tree: 3 files changed, +148 −148 lines

GPTQ.py

Lines changed: 121 additions & 128 deletions
@@ -3,158 +3,151 @@
 
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
-import os
-import sys
 
 import torch
 
-lm_evaluation_harness_path = "/".join(
-    os.getcwd().split("/")[:-1] + ["lm-evaluation-harness"]
-)
-sys.path.insert(0, lm_evaluation_harness_path)
-import main as lm_evaluation_harness_main
 import torch.fx as fx
 import torch.nn as nn
 import torch.nn.functional as F
 from torch.utils._pytree import tree_flatten, tree_unflatten
 
-from eval import setup_cache_padded_seq_input_pos_max_seq_length_for_prefill
-from generate import encode_tokens
-
 aten = torch.ops.aten
 
-try:
-    import lm_eval
-    class InputRecorder(lm_eval.base.BaseLM):
-        """
-        This is a fake evaluation wrapper that just records the inputs
-        so that they can be used in calibration.
-
-        If pad_calibration_inputs is enabled, the input recorder will take
-        each input and pad/truncate it down to the calibration_seq_length.
-        It will also edit the model embeddings to be zero for the 0 token used
-        in padding and avoid any inputs with the 0 token.
-
-        If not, it will only truncate inputs to the desired length.
-        """
-
-        def __init__(
-            self,
-            model,
-            tokenizer,
-            calibration_seq_length,
-            pad_calibration_inputs=False,
-        ):
-            super().__init__()
-            self._model = model
-            self._tokenizer = tokenizer
-            self._device = torch.device("cpu")
-            self.vocab_size = model.config.vocab_size
-            self.calibration_seq_length = calibration_seq_length
-            self.pad_calibration_inputs = pad_calibration_inputs
-            self.inputs = None
-
-            if self.pad_calibration_inputs:
-                # This is needed for the pad_calibration_inputs option
-                # to work properly, the 0 token's embeddings are set to 0 so that
-                # the padded inputs will not affect the model numerics. This token isn't used
-                # commonly in the eval tasks for the meta-llama tokenizer and we skip any inputs
-                # where it appears
-                try:
-                    if isinstance(self._model.transformer.wte, nn.Embedding):
-                        self.mod.transformer.wte.weight.data[0, :] *= 0
-                except:
-                    print(
-                        "Did not find embeddings in model.transformer.wte, disabling padding"
-                    )
-                    self.pad_calibration_inputs = False
+from eval import (
+    setup_cache_padded_seq_input_pos_max_seq_length_for_prefill,
+    encode_tokens,
+    eval_wrapper
+)
 
-        @property
-        def eot_token_id(self):
-            return self._tokenizer.eos_id()
 
-        @property
-        def max_length(self):
-            return self.calibration_seq_length
+class InputRecorder(eval_wrapper):
+    """
+    This is a fake evaluation wrapper that just records the inputs
+    so that they can be used in calibration.
 
-        @property
-        def max_gen_toks(self):
-            return 50
+    If pad_calibration_inputs is enabled, the input recorder will take
+    each input and pad/truncate it down to the calibration_seq_length.
+    It will also edit the model embeddings to be zero for the 0 token used
+    in padding and avoid any inputs with the 0 token.
 
-        @property
-        def batch_size(self):
-            return 1
+    If not, it will only truncate inputs to the desired length.
+    """
 
-        @property
-        def device(self):
-            return self._device
+    def __init__(
+        self,
+        model,
+        tokenizer,
+        calibration_seq_length,
+        pad_calibration_inputs=False,
+    ):
+        super().__init__()
+        self._model = model
+        self._tokenizer = tokenizer
+        self._device = torch.device("cpu")
+        self.vocab_size = model.config.vocab_size
+        self.calibration_seq_length = calibration_seq_length
+        self.pad_calibration_inputs = pad_calibration_inputs
+        self.inputs = None
+
+        if self.pad_calibration_inputs:
+            # This is needed for the pad_calibration_inputs option
+            # to work properly, the 0 token's embeddings are set to 0 so that
+            # the padded inputs will not affect the model numerics. This token isn't used
+            # commonly in the eval tasks for the meta-llama tokenizer and we skip any inputs
+            # where it appears
+            try:
+                if isinstance(self._model.transformer.wte, nn.Embedding):
+                    self.mod.transformer.wte.weight.data[0, :] *= 0
+            except:
+                print(
+                    "Did not find embeddings in model.transformer.wte, disabling padding"
+                )
+                self.pad_calibration_inputs = False
 
-        def tok_encode(self, string: str):
-            encoded = encode_tokens(
-                self._tokenizer, string, bos=True, device=self._device
-            )
-            # encoded is a pytorch tensor, but some internal logic in the
-            # eval harness expects it to be a list instead
-            # TODO: verify this for multi-batch as well
-            encoded = encoded.tolist()
-            return encoded
-
-        def tok_decode(self, tokens):
-            decoded = self._tokenizer.decode(tokens)
-            return decoded
-
-        def add_input(self, args):
-            if self.inputs is None:
-                self.inputs = [MultiInput([arg]) for arg in args]
-            else:
-                self.inputs = [
-                    multi.add_input(arg) for (multi, arg) in zip(self.inputs, args)
-                ]
+    @property
+    def eot_token_id(self):
+        return self._tokenizer.eos_id()
 
-        def get_recorded_inputs(self):
-            return self.inputs
+    @property
+    def max_length(self):
+        return self.calibration_seq_length
 
-        def _model_call(self, inps):
-            inps = inps.squeeze(0)
-            T = len(inps)
-            if (
-                # can't use inputs that are too short when padding disabled
-                (T < self.calibration_seq_length and not self.pad_calibration_inputs)
-                or
-                # can't use inputs that actually use token we use for padding
-                (self.pad_calibration_inputs and 0 in inps)
-            ):
-                # give random output
-                return torch.randn(
-                    (1, T, self.vocab_size), dtype=torch.bfloat16, device=self._device
-                )
+    @property
+    def max_gen_toks(self):
+        return 50
 
-            # pad or truncate to the right size
-            if T >= self.calibration_seq_length:
-                inps = inps[: self.calibration_seq_length]
-            else:
-                inps = F.pad(inps, (0, self.calibration_seq_length - T))
-
-            max_new_tokens = 1
-            (
-                seq,
-                input_pos,
-                max_seq_length,
-            ) = setup_cache_padded_seq_input_pos_max_seq_length_for_prefill(
-                self._model, inps, max_new_tokens, self.max_length
-            )
-            x = seq.index_select(0, input_pos).view(1, -1)
-            self.add_input((x, input_pos))
+    @property
+    def batch_size(self):
+        return 1
 
-            # output `something` with correct shape to keep eval going
+    @property
+    def device(self):
+        return self._device
+
+    def tok_encode(self, string: str):
+        encoded = encode_tokens(
+            self._tokenizer, string, bos=True, device=self._device
+        )
+        # encoded is a pytorch tensor, but some internal logic in the
+        # eval harness expects it to be a list instead
+        # TODO: verify this for multi-batch as well
+        encoded = encoded.tolist()
+        return encoded
+
+    def tok_decode(self, tokens):
+        decoded = self._tokenizer.decode(tokens)
+        return decoded
+
+    def add_input(self, args):
+        if self.inputs is None:
+            self.inputs = [MultiInput([arg]) for arg in args]
+        else:
+            self.inputs = [
+                multi.add_input(arg) for (multi, arg) in zip(self.inputs, args)
+            ]
+
+    def get_recorded_inputs(self):
+        return self.inputs
+
+    def _model_call(self, inps):
+        inps = inps.squeeze(0)
+        T = len(inps)
+        if (
+            # can't use inputs that are too short when padding disabled
+            (T < self.calibration_seq_length and not self.pad_calibration_inputs)
+            or
+            # can't use inputs that actually use token we use for padding
+            (self.pad_calibration_inputs and 0 in inps)
+        ):
+            # give random output
             return torch.randn(
                 (1, T, self.vocab_size), dtype=torch.bfloat16, device=self._device
             )
 
-        def _model_generate(self, context, max_length, eos_token_id):
-            raise Exception("unimplemented")
-except ImportError:
-    pass
+        # pad or truncate to the right size
+        if T >= self.calibration_seq_length:
+            inps = inps[: self.calibration_seq_length]
+        else:
+            inps = F.pad(inps, (0, self.calibration_seq_length - T))
+
+        max_new_tokens = 1
+        (
+            seq,
+            input_pos,
+            max_seq_length,
+        ) = setup_cache_padded_seq_input_pos_max_seq_length_for_prefill(
+            self._model, inps, max_new_tokens, self.max_length
+        )
+        x = seq.index_select(0, input_pos).view(1, -1)
+        self.add_input((x, input_pos))
+
+        # output `something` with correct shape to keep eval going
+        return torch.randn(
+            (1, T, self.vocab_size), dtype=torch.bfloat16, device=self._device
+        )
+
+    def _model_generate(self, context, max_length, eos_token_id):
+        raise Exception("unimplemented")
 
 
 class MultiInput:
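For context, the refactored InputRecorder is not called directly; the eval harness drives it, and the recorder just captures the (x, input_pos) pairs inside _model_call while returning random logits. A rough usage sketch in the spirit of quantize.py's get_inputs (the model and tokenizer objects are assumed to be already loaded; the task list and limit are illustrative):

    # Sketch: collect GPTQ calibration inputs by running the harness through the recorder.
    from GPTQ import InputRecorder
    from eval import get_task_dict, evaluate

    input_recorder = InputRecorder(
        model,                        # assumed: a loaded gpt-fast Transformer
        tokenizer,                    # assumed: a SentencePiece tokenizer
        calibration_seq_length=100,
        pad_calibration_inputs=False,
    )
    task_dict = get_task_dict(["wikitext"])
    # The recorder returns random logits, so this pass only records inputs.
    evaluate(input_recorder, task_dict, limit=10)
    calibration_inputs = input_recorder.get_recorded_inputs()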

eval.py

Lines changed: 23 additions & 17 deletions
@@ -18,26 +18,32 @@
 torch._inductor.config.triton.cudagraphs = True
 torch._dynamo.config.cache_size_limit = 100000
 
-# support running without installing as a package
-wd = Path(__file__).parent.parent.resolve()
-sys.path.append(str(wd))
-
-# hacky path setup for lm-evaluation-harness
-import os
-import sys
-
 from sentencepiece import SentencePieceProcessor
 
 from model import Transformer
 
-lm_evaluation_harness_path = '/'.join(
-    os.getcwd().split('/')[:-1] + ['lm-evaluation-harness'])
-sys.path.insert(0, lm_evaluation_harness_path)
-import lm_eval
-import main as lm_evaluation_harness_main
+try:
+    import lm_eval
+    lm_eval_available = True
+except:
+    lm_eval_available = False
 
 from generate import _load_model, encode_tokens, model_forward
 
+if lm_eval_available:
+    try: # lm_eval version 0.4
+        from lm_eval.models.huggingface import HFLM as eval_wrapper
+        from lm_eval.tasks import get_task_dict
+        from lm_eval.evaluator import evaluate
+        lm_eval.tasks.initialize_tasks()
+    except: #lm_eval version 0.3
+        from lm_eval import base
+        from lm_eval import tasks
+        from lm_eval import evaluator
+        eval_wrapper=base.BaseLM
+        get_task_dict=tasks.get_task_dict
+        evaluate=evaluator.evaluate
+
 
 def setup_cache_padded_seq_input_pos_max_seq_length_for_prefill(
     model: Transformer,
@@ -77,7 +83,7 @@ def setup_cache_padded_seq_input_pos_max_seq_length_for_prefill(
 
     return seq, input_pos, max_seq_length
 
-class GPTFastEvalWrapper(lm_eval.base.BaseLM):
+class GPTFastEvalWrapper(eval_wrapper):
     """
     A wrapper class for GPTFast, providing integration with the lm-evaluation-harness library.
     """
@@ -113,7 +119,7 @@ def batch_size(self):
     def device(self):
         return self._device
 
-    def tok_encode(self, string: str):
+    def tok_encode(self, string: str, **kwargs):
         encoded = encode_tokens(self._tokenizer,
             string, bos=True, device=self._device)
         # encoded is a pytorch tensor, but some internal logic in the
@@ -176,9 +182,9 @@ def eval(
     if 'hendrycks_test' in tasks:
         tasks.remove('hendrycks_test')
         tasks += [x for x in lm_eval.tasks.hendrycks_test.create_all_tasks().keys()]
-    task_dict = lm_eval.tasks.get_task_dict(tasks)
+    task_dict = get_task_dict(tasks)
 
-    eval_results = lm_eval.evaluator.evaluate(
+    eval_results = evaluate(
         model_eval_wrapper,
         task_dict,
         limit=limit,
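The **kwargs added to tok_encode is part of what keeps GPTFastEvalWrapper working on both harnesses: lm_eval 0.4 code paths can call tok_encode with extra keyword arguments that 0.3 never passed, so accepting and ignoring them avoids a TypeError. A minimal sketch of the pattern (the specific keyword names are assumptions, not an exhaustive list):

    # Sketch: one signature that tolerates harness-specific keyword arguments.
    def tok_encode(self, string: str, **kwargs):
        # e.g. lm_eval 0.4 may pass options such as add_special_tokens (name assumed);
        # they are ignored here because encode_tokens already handles the BOS token.
        encoded = encode_tokens(self._tokenizer, string, bos=True, device=self._device)
        return encoded.tolist()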

quantize.py

Lines changed: 4 additions & 3 deletions
@@ -12,7 +12,8 @@
 from sentencepiece import SentencePieceProcessor
 
 try:
-    from GPTQ import GenericGPTQRunner, InputRecorder, lm_eval
+    from GPTQ import GenericGPTQRunner, InputRecorder
+    from eval import get_task_dict, evaluate
 except:
     pass
 
@@ -248,9 +249,9 @@ def get_inputs(model, tokenizer, calibration_tasks, calibration_limit, calibrati
         calibration_seq_length,
         pad_calibration_inputs,
     )
-    task_dict = lm_eval.tasks.get_task_dict(calibration_tasks)
+    task_dict = get_task_dict(calibration_tasks)
     print("Obtaining GPTQ calibration inputs on: ", calibration_tasks)
-    lm_eval.evaluator.evaluate(
+    evaluate(
         input_recorder,
         task_dict,
         limit=calibration_limit,
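Because the new imports of get_task_dict and evaluate sit inside the existing try/except, a missing lm_eval simply leaves those names undefined; plain int8 quantization still runs (as in the test plan above) and only the GPTQ calibration path needs them. A small guard one could add before calibration (an assumption, not part of this commit):

    # Hypothetical guard: fail fast with a clear message before GPTQ calibration.
    def require_lm_eval():
        if "get_task_dict" not in globals() or "evaluate" not in globals():
            raise RuntimeError("GPTQ calibration requires lm_eval (0.3 or 0.4) to be installed")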
