Skip to content

Commit 706971b

Browse files
wj-Mcatsijunhe
and authored
[Bug fixes] fix load-torch with different prefix key (#4141)
* fix load-torch scripts * fix tensor reading * add serialization error for seek_by_string Co-authored-by: Sijun He <[email protected]>
1 parent dd792d6 commit 706971b

File tree

2 files changed

+71
-14
lines changed

2 files changed

+71
-14
lines changed

paddlenlp/utils/serialization.py

Lines changed: 61 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,12 @@
1515
from __future__ import annotations
1616

1717
import io
18+
import os
1819
import pickle
1920
from functools import lru_cache
2021

2122
import numpy as np
23+
from _io import BufferedReader
2224

2325
MZ_ZIP_LOCAL_DIR_HEADER_SIZE = 30
2426

@@ -38,6 +40,12 @@ def __repr__(self):
3840
return f"size: {self.size} key: {self.key}, nbytes: {self.nbytes}, dtype: {self.dtype}"
3941

4042

43+
class SerializationError(Exception):
    """Raised when a torch weight file cannot be parsed as expected."""
47+
48+
4149
@lru_cache(maxsize=None)
4250
def _storage_type_to_dtype_to_map():
4351
"""convert storage type to numpy dtype"""
@@ -123,6 +131,52 @@ def dumpy(*args, **kwarsg):
123131
return None
124132

125133

134+
def seek_by_string(file_handler: BufferedReader, string: str, file_size: int) -> int:
    """Advance the file handler until ``string`` has been read, byte by byte.

    Args:
        file_handler (BufferedReader): open binary handle, positioned where the
            scan should start
        string (str): the target string to search for in the file
        file_size (int): total size of the file in bytes

    Returns:
        int: offset of the byte immediately after the end of the target string

    Raises:
        SerializationError: if the target string is not found before EOF
    """
    target = string.encode("latin")
    if not target:
        # an empty target trivially matches at the current position
        return file_handler.tell()

    # Keep a sliding window of the last len(target) bytes seen. A plain
    # "reset the match counter on mismatch" scan misses overlapping
    # matches (e.g. "aab" inside "aaab") and cannot tell a completed match
    # at EOF from a failed one.
    window = b""

    while file_handler.tell() < file_size:
        content = file_handler.read(1)
        if not content:
            break
        window = (window + content)[-len(target):]
        if window == target:
            return file_handler.tell()

    raise SerializationError(f"can't find the target string<{string}> in the file")
162+
163+
164+
def read_prefix_key(file_handler: BufferedReader, file_size: int):
    """Read the archive prefix key from a torch weight file, eg: ``archive``
    in the entry name ``archive/data.pkl``.

    Args:
        file_handler (BufferedReader): open binary handle on the weight file
        file_size (int): total size of the file in bytes

    Returns:
        bytes: the raw prefix key bytes read from the zip local-file header
    """
    pkl_end = seek_by_string(file_handler, "data.pkl", file_size)
    # The entry filename starts right after the fixed-size zip local header;
    # everything before the trailing "/data.pkl" is the prefix key.
    file_handler.seek(MZ_ZIP_LOCAL_DIR_HEADER_SIZE)
    prefix_length = pkl_end - MZ_ZIP_LOCAL_DIR_HEADER_SIZE - len("/data.pkl")
    return file_handler.read(prefix_length)
178+
179+
126180
def load_torch(path: str, **pickle_load_args):
127181
"""
128182
load torch weight file with the following steps:
@@ -142,8 +196,6 @@ def load_torch(path: str, **pickle_load_args):
142196
# 1. load the structure of pytorch weight file
143197
def persistent_load_stage1(saved_id):
144198
assert isinstance(saved_id, tuple)
145-
print(saved_id)
146-
147199
data = saved_id[1:]
148200
storage_type, key, _, numel = data
149201
dtype = storage_type.dtype
@@ -173,21 +225,20 @@ def extract_maybe_dict(result):
173225
metadata = sorted(metadata, key=lambda x: x.key)
174226
# 3. parse the tensor of pytorch weight file
175227
stage1_key_to_tensor = {}
228+
content_size = os.stat(path).st_size
176229
with open(path, "rb") as file_handler:
230+
prefix_key = read_prefix_key(file_handler, content_size).decode("latin")
177231
file_handler.seek(pre_offset)
232+
178233
for tensor_meta in metadata:
179234
key = tensor_meta.key
180235
# eg: archive/data/1FB
181-
filename_with_fb = len(f"archive/data/{key}") + 2
182-
183-
# skip the fix position to read tensor data
184-
# `MZ_ZIP_LOCAL_DIR_HEADER_SIZE` is from: https://github.com/pytorch/pytorch/blob/master/caffe2/serialize/inline_container.cc#L186
185-
# `16` is the fixed characters size from binary file.
186-
# `filename_with_fb` is the length of dynamic data key name
187-
file_handler.seek(MZ_ZIP_LOCAL_DIR_HEADER_SIZE + 16 + filename_with_fb, 1)
236+
filename = f"{prefix_key}/data/{key}"
237+
seek_by_string(file_handler, filename, content_size)
238+
file_handler.seek(2, 1)
188239

189240
padding_offset = np.frombuffer(file_handler.read(2)[:1], dtype=np.uint8)[0]
190-
file_handler.read(padding_offset)
241+
file_handler.seek(padding_offset, 1)
191242

192243
# save the tensor info in result to re-use memory
193244
stage1_key_to_tensor[key] = np.frombuffer(

tests/utils/test_serialization.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
from parameterized import parameterized
2222

2323
from paddlenlp.utils import load_torch
24-
from tests.testing_utils import require_package, slow
24+
from tests.testing_utils import require_package
2525

2626

2727
class SerializationTest(TestCase):
@@ -54,14 +54,20 @@ def test_simple_load(self, dtype: str):
5454
torch_data[key].numpy(),
5555
)
5656

57-
@slow
5857
@require_package("torch")
59-
def test_load_bert_model(self):
58+
@parameterized.expand(
59+
[
60+
"hf-internal-testing/tiny-random-codegen",
61+
"hf-internal-testing/tiny-random-Data2VecTextModel",
62+
"hf-internal-testing/tiny-random-SwinModel",
63+
]
64+
)
65+
def test_load_bert_model(self, repo_id):
6066
import torch
6167

6268
with tempfile.TemporaryDirectory() as tempdir:
6369
weight_file = hf_hub_download(
64-
repo_id="hf-internal-testing/tiny-random-codegen",
70+
repo_id=repo_id,
6571
filename="pytorch_model.bin",
6672
cache_dir=tempdir,
6773
library_name="PaddleNLP",

0 commit comments

Comments
 (0)