
Commit c90ff5e

ppwwyyxx authored and facebook-github-bot committed
simplify logging during caffe2 model loading
Summary: Group similar weight names together. Produces ~5x fewer log lines.

Reviewed By: theschnitz

Differential Revision: D26421225

fbshipit-source-id: 0696af09fe18d0faa47d8c1d1e6f8d26081dee41
1 parent c470b67 commit c90ff5e
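
To see the shape of the grouped output, here is a rough sketch that feeds made-up weight names through the same tabulate call the patch adds. The row contents (names, shapes) are hypothetical; only the headers and tablefmt="pipe" come from the diff below.

# Sketch of the new grouped log format; the rows are made up, but the
# headers and tablefmt="pipe" mirror the tabulate call added in this commit.
from tabulate import tabulate

rows = [
    ("backbone.res2.conv1.*", "res2_conv1_bn_*", "(64,) (64,) (64,) (64,)"),
    ("backbone.stem.conv1.weight", "conv1_w", "(64,3,7,7)"),
]
print(tabulate(rows, tablefmt="pipe", headers=["Names in Model", "Names in Checkpoint", "Shapes"]))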

File tree

3 files changed: +127 −39 lines


detectron2/checkpoint/c2_model_loading.py

Lines changed: 123 additions & 32 deletions
@@ -2,11 +2,9 @@
 import copy
 import logging
 import re
+from typing import Dict, List
 import torch
-from fvcore.common.checkpoint import (
-    get_missing_parameters_message,
-    get_unexpected_parameters_message,
-)
+from tabulate import tabulate
 
 
 def convert_basic_c2_names(original_keys):
@@ -77,7 +75,7 @@ def convert_c2_detectron_names(weights):
         dict: detectron2 names -> C2 names
     """
     logger = logging.getLogger(__name__)
-    logger.info("Remapping C2 weights ......")
+    logger.info("Renaming Caffe2 weights ......")
     original_keys = sorted(weights.keys())
     layer_keys = copy.deepcopy(original_keys)
 
@@ -210,8 +208,9 @@ def fpn_map(name):
 # it assumes model_state_dict will have longer names.
 def align_and_update_state_dicts(model_state_dict, ckpt_state_dict, c2_conversion=True):
     """
-    Match names between the two state-dict, and update the values of model_state_dict in-place with
-    copies of the matched tensor in ckpt_state_dict.
+    Match names between the two state-dicts, and return a new ckpt_state_dict with names
+    converted to match model_state_dict with heuristics. The returned dict can later be
+    loaded with the fvcore checkpointer.
     If `c2_conversion==True`, `ckpt_state_dict` is assumed to be a Caffe2
     model and will be renamed at first.
 
@@ -251,13 +250,10 @@ def match(a, b):
     # remove indices that correspond to no-match
     idxs[max_match_size == 0] = -1
 
-    # used for logging
-    max_len_model = max(len(key) for key in model_keys) if model_keys else 1
-    max_len_ckpt = max(len(key) for key in ckpt_keys) if ckpt_keys else 1
-    log_str_template = "{: <{}} loaded from {: <{}} of shape {}"
     logger = logging.getLogger(__name__)
     # matched_pairs (matched checkpoint key --> matched model key)
     matched_keys = {}
+    result_state_dict = {}
     for idx_model, idx_ckpt in enumerate(idxs.tolist()):
         if idx_ckpt == -1:
             continue
@@ -279,7 +275,8 @@ def match(a, b):
             )
             continue
 
-        model_state_dict[key_model] = value_ckpt.clone()
+        assert key_model not in result_state_dict
+        result_state_dict[key_model] = value_ckpt
         if key_ckpt in matched_keys:  # already added to matched_keys
             logger.error(
                 "Ambiguity found for {} in checkpoint!"
@@ -290,24 +287,118 @@ def match(a, b):
             raise ValueError("Cannot match one checkpoint key to multiple keys in the model.")
 
         matched_keys[key_ckpt] = key_model
-        logger.info(
-            log_str_template.format(
-                key_model,
-                max_len_model,
-                original_keys[key_ckpt],
-                max_len_ckpt,
-                tuple(shape_in_model),
+
+    # logging:
+    matched_model_keys = sorted(matched_keys.values())
+    common_prefix = _longest_common_prefix(matched_model_keys)
+    rev_matched_keys = {v: k for k, v in matched_keys.items()}
+    original_keys = {k: original_keys[rev_matched_keys[k]] for k in matched_model_keys}
+
+    model_key_groups = _group_keys_by_module(matched_model_keys, original_keys)
+    table = []
+    memo = set()
+    for key_model in matched_model_keys:
+        if key_model in memo:
+            continue
+        if key_model in model_key_groups:
+            group = model_key_groups[key_model]
+            memo |= set(group)
+            shapes = [tuple(model_state_dict[k].shape) for k in group]
+            table.append(
+                (
+                    _longest_common_prefix([k[len(common_prefix) :] for k in group]) + "*",
+                    _group_str([original_keys[k] for k in group]),
+                    " ".join([str(x).replace(" ", "") for x in shapes]),
+                )
             )
-        )
-    matched_model_keys = matched_keys.values()
-    matched_ckpt_keys = matched_keys.keys()
-    # print warnings about unmatched keys on both side
-    unmatched_model_keys = [k for k in model_keys if k not in matched_model_keys]
-    if len(unmatched_model_keys):
-        logger.info(get_missing_parameters_message(unmatched_model_keys))
-
-    unmatched_ckpt_keys = [k for k in ckpt_keys if k not in matched_ckpt_keys]
-    if len(unmatched_ckpt_keys):
-        logger.info(
-            get_unexpected_parameters_message(original_keys[x] for x in unmatched_ckpt_keys)
-        )
+        else:
+            key_checkpoint = original_keys[key_model]
+            shape = str(tuple(model_state_dict[key_model].shape))
+            table.append((key_model[len(common_prefix) :], key_checkpoint, shape))
+    table_str = tabulate(
+        table, tablefmt="pipe", headers=["Names in Model", "Names in Checkpoint", "Shapes"]
+    )
+    logger.info(
+        "Following weights matched with "
+        + (f"submodule {common_prefix[:-1]}" if common_prefix else "model")
+        + ":\n"
+        + table_str
+    )
+
+    unmatched_ckpt_keys = [k for k in ckpt_keys if k not in set(matched_keys.keys())]
+    for k in unmatched_ckpt_keys:
+        result_state_dict[k] = ckpt_state_dict[k]
+    return result_state_dict
+
+
+def _group_keys_by_module(keys: List[str], original_names: Dict[str, str]):
+    """
+    Params in the same submodule are grouped together.
+
+    Args:
+        keys: names of all parameters
+        original_names: mapping from parameter name to their name in the checkpoint
+
+    Returns:
+        dict[name -> all other names in the same group]
+    """
+
+    def _submodule_name(key):
+        pos = key.rfind(".")
+        if pos < 0:
+            return None
+        prefix = key[: pos + 1]
+        return prefix
+
+    all_submodules = [_submodule_name(k) for k in keys]
+    all_submodules = [x for x in all_submodules if x]
+    all_submodules = sorted(all_submodules, key=len)
+
+    ret = {}
+    for prefix in all_submodules:
+        group = [k for k in keys if k.startswith(prefix)]
+        if len(group) <= 1:
+            continue
+        original_name_lcp = _longest_common_prefix_str([original_names[k] for k in group])
+        if len(original_name_lcp) == 0:
+            # don't group weights if original names don't share prefix
+            continue
+
+        for k in group:
+            if k in ret:
+                continue
+            ret[k] = group
+    return ret
+
+
+def _longest_common_prefix(names: List[str]) -> str:
+    """
+    ["abc.zfg", "abc.zef"] -> "abc."
+    """
+    names = [n.split(".") for n in names]
+    m1, m2 = min(names), max(names)
+    ret = [a for a, b in zip(m1, m2) if a == b]
+    ret = ".".join(ret) + "." if len(ret) else ""
+    return ret
+
+
+def _longest_common_prefix_str(names: List[str]) -> str:
+    m1, m2 = min(names), max(names)
+    lcp = [a for a, b in zip(m1, m2) if a == b]
+    lcp = "".join(lcp)
+    return lcp
+
+
+def _group_str(names: List[str]) -> str:
+    """
+    Turn "common1", "common2", "common3" into "common{1,2,3}"
+    """
+    lcp = _longest_common_prefix_str(names)
+    rest = [x[len(lcp) :] for x in names]
+    rest = "{" + ",".join(rest) + "}"
+    ret = lcp + rest
+
+    # add some simplification for BN specifically
+    ret = ret.replace("bn_{beta,running_mean,running_var,gamma}", "bn_*")
+    ret = ret.replace("bn_beta,bn_running_mean,bn_running_var,bn_gamma", "bn_*")
+    return ret
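
For intuition, a small usage sketch of the name-grouping helpers added above (assumes a detectron2 checkout containing this commit; the weight names are hypothetical):

# Demonstrates the grouping helpers; sample names are hypothetical.
from detectron2.checkpoint.c2_model_loading import _group_str, _longest_common_prefix

# Dotted model keys are reduced to their longest common module prefix:
print(_longest_common_prefix(["backbone.res2.conv1.weight", "backbone.res2.conv1.norm.weight"]))
# -> "backbone.res2.conv1."

# Checkpoint names collapse into a brace group after their common prefix:
print(_group_str(["conv1_w", "conv1_b"]))
# -> "conv1_{w,b}"

# The BatchNorm quadruple is additionally abbreviated to "bn_*":
print(_group_str(["res2_conv1_bn_beta", "res2_conv1_bn_running_mean",
                  "res2_conv1_bn_running_var", "res2_conv1_bn_gamma"]))
# -> "res2_conv1_bn_*"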

detectron2/checkpoint/detection_checkpoint.py

Lines changed: 2 additions & 6 deletions
@@ -60,17 +60,13 @@ def _load_model(self, checkpoint):
         if checkpoint.get("matching_heuristics", False):
             self._convert_ndarray_to_tensor(checkpoint["model"])
             # convert weights by name-matching heuristics
-            model_state_dict = self.model.state_dict()
-            align_and_update_state_dicts(
-                model_state_dict,
+            checkpoint["model"] = align_and_update_state_dicts(
+                self.model.state_dict(),
                 checkpoint["model"],
                 c2_conversion=checkpoint.get("__author__", None) == "Caffe2",
             )
-            checkpoint["model"] = model_state_dict
         # for non-caffe2 models, use standard ways to load it
         incompatible = super()._load_model(checkpoint)
-        if incompatible is None:  # support older versions of fvcore
-            return None
 
         model_buffers = dict(self.model.named_buffers(recurse=False))
         for k in ["pixel_mean", "pixel_std"]:

tests/test_checkpoint.py

Lines changed: 2 additions & 1 deletion
@@ -36,7 +36,8 @@ def test_complex_model_loaded(self):
         model = nn.DataParallel(model)
         model_sd = model.state_dict()
 
-        align_and_update_state_dicts(model_sd, state_dict)
+        sd_to_load = align_and_update_state_dicts(model_sd, state_dict)
+        model.load_state_dict(sd_to_load)
         for loaded, stored in zip(model_sd.values(), state_dict.values()):
             # different tensor references
             self.assertFalse(id(loaded) == id(stored))
