Commit 27afc39

fix(train): missing ckpt var
1 parent: 75b6ab6

File tree

1 file changed: +3 -128 lines changed


infer/lib/train/utils.py

Lines changed: 3 additions & 128 deletions
@@ -16,62 +16,12 @@
 logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
 logger = logging

-"""
-def load_checkpoint_d(checkpoint_path, combd, sbd, optimizer=None, load_opt=1):
-    assert os.path.isfile(checkpoint_path)
-    checkpoint_dict = torch.load(checkpoint_path, map_location="cpu")
-
-    ##################
-    def go(model, bkey):
-        saved_state_dict = checkpoint_dict[bkey]
-        if hasattr(model, "module"):
-            state_dict = model.module.state_dict()
-        else:
-            state_dict = model.state_dict()
-        new_state_dict = {}
-        for k, v in state_dict.items():  # the shapes the model expects
-            try:
-                new_state_dict[k] = saved_state_dict[k]
-                if saved_state_dict[k].shape != state_dict[k].shape:
-                    logger.warning(
-                        "shape-%s-mismatch. need: %s, get: %s",
-                        k,
-                        state_dict[k].shape,
-                        saved_state_dict[k].shape,
-                    )  #
-                    raise KeyError
-            except:
-                # logger.info(traceback.format_exc())
-                logger.info("%s is not in the checkpoint", k)  # missing from the pretrained model
-                new_state_dict[k] = v  # keep the model's own randomly initialized value
-        if hasattr(model, "module"):
-            model.module.load_state_dict(new_state_dict, strict=False)
-        else:
-            model.load_state_dict(new_state_dict, strict=False)
-        return model
-
-    go(combd, "combd")
-    model = go(sbd, "sbd")
-    #############
-    logger.info("Loaded model weights")
-
-    iteration = checkpoint_dict["iteration"]
-    learning_rate = checkpoint_dict["learning_rate"]
-    if (
-        optimizer is not None and load_opt == 1
-    ):  ### if loading fails (e.g. the saved state is empty), reinitialize; this can also affect LR-schedule updates, so it is caught at the outermost level of the train script
-        # try:
-        optimizer.load_state_dict(checkpoint_dict["optimizer"])
-        # except:
-        #     traceback.print_exc()
-    logger.info("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, iteration))
-    return model, optimizer, learning_rate, iteration
-"""
-

 def load_checkpoint(checkpoint_path, model, optimizer=None, load_opt=1):
     assert os.path.isfile(checkpoint_path)
-    saved_state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)["model"]
+    checkpoint_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)
+
+    saved_state_dict = checkpoint_dict["model"]
     if hasattr(model, "module"):
         state_dict = model.module.state_dict()
     else:
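
Why the fix is needed: an earlier refactor collapsed the load in load_checkpoint into a single torch.load(...)["model"] expression, while later lines of the function (outside this hunk) still read other keys from the full checkpoint dict, so the name checkpoint_dict no longer existed, hence "missing ckpt var". Below is a minimal sketch of the restored pattern; the file path and the exact later reads are assumptions inferred from the removed load_checkpoint_d twin, not shown in this diff:

import torch

# Hypothetical path; any checkpoint saved with the matching keys works.
checkpoint_path = "checkpoint.pth"

# Before the fix, only the "model" sub-dict survived:
#     saved_state_dict = torch.load(...)["model"]
# so any later reference to checkpoint_dict raised a NameError.
# After the fix, the whole dict is kept and indexed:
checkpoint_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)
saved_state_dict = checkpoint_dict["model"]
iteration = checkpoint_dict["iteration"]          # assumed key, as in load_checkpoint_d
learning_rate = checkpoint_dict["learning_rate"]  # assumed key, as in load_checkpoint_d
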
@@ -132,34 +82,6 @@ def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path)
     )


-"""
-def save_checkpoint_d(combd, sbd, optimizer, learning_rate, iteration, checkpoint_path):
-    logger.info(
-        "Saving model and optimizer state at epoch {} to {}".format(
-            iteration, checkpoint_path
-        )
-    )
-    if hasattr(combd, "module"):
-        state_dict_combd = combd.module.state_dict()
-    else:
-        state_dict_combd = combd.state_dict()
-    if hasattr(sbd, "module"):
-        state_dict_sbd = sbd.module.state_dict()
-    else:
-        state_dict_sbd = sbd.state_dict()
-    torch.save(
-        {
-            "combd": state_dict_combd,
-            "sbd": state_dict_sbd,
-            "iteration": iteration,
-            "optimizer": optimizer.state_dict(),
-            "learning_rate": learning_rate,
-        },
-        checkpoint_path,
-    )
-"""
-

 def summarize(
     writer,
     global_step,
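
The shape-tolerant merge from the removed load_checkpoint_d is the same pattern the surviving load_checkpoint continues with (see the context lines at the end of the first hunk). A self-contained sketch of that merge follows; the function name is hypothetical, and it uses dict.get() in place of the original's try/except:

import logging
import torch

def merge_partial_state_dict(model, saved_state_dict):
    # Copy a checkpoint tensor only when its shape matches what the
    # model expects; otherwise keep the model's own randomly
    # initialized tensor, as the removed go() helper did.
    target = model.module if hasattr(model, "module") else model
    state_dict = target.state_dict()
    new_state_dict = {}
    for k, v in state_dict.items():
        saved = saved_state_dict.get(k)
        if saved is not None and saved.shape == v.shape:
            new_state_dict[k] = saved
        else:
            logging.info("%s is not in the checkpoint (or shape-mismatched)", k)
            new_state_dict[k] = v
    target.load_state_dict(new_state_dict, strict=False)
    return model
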
@@ -366,53 +288,6 @@ def get_hparams(init=True):
     return hparams


-"""
-def get_hparams_from_dir(model_dir):
-    config_save_path = os.path.join(model_dir, "config.json")
-    with open(config_save_path, "r") as f:
-        data = f.read()
-    config = json.loads(data)
-
-    hparams = HParams(**config)
-    hparams.model_dir = model_dir
-    return hparams
-
-
-def get_hparams_from_file(config_path):
-    with open(config_path, "r") as f:
-        data = f.read()
-    config = json.loads(data)
-
-    hparams = HParams(**config)
-    return hparams
-
-
-def check_git_hash(model_dir):
-    source_dir = os.path.dirname(os.path.realpath(__file__))
-    if not os.path.exists(os.path.join(source_dir, ".git")):
-        logger.warning(
-            "{} is not a git repository, therefore hash value comparison will be ignored.".format(
-                source_dir
-            )
-        )
-        return
-
-    cur_hash = subprocess.getoutput("git rev-parse HEAD")
-
-    path = os.path.join(model_dir, "githash")
-    if os.path.exists(path):
-        saved_hash = open(path).read()
-        if saved_hash != cur_hash:
-            logger.warning(
-                "git hash values are different. {}(saved) != {}(current)".format(
-                    saved_hash[:8], cur_hash[:8]
-                )
-            )
-    else:
-        open(path, "w").write(cur_hash)
-"""
-
-
 def get_logger(model_dir, filename="train.log"):
     global logger
     logger = logging.getLogger(os.path.basename(model_dir))
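
The removed config helpers reduced to one pattern: parse config.json and wrap it in HParams. If an external caller still needs that, a one-function equivalent can be sketched as below; it assumes the HParams class defined elsewhere in this same utils.py (its definition is outside the diff) is in scope:

import json

def get_hparams_from_file(config_path):
    # Parse the JSON config and wrap it, as the removed helper did.
    # Assumes HParams from infer/lib/train/utils.py is in scope.
    with open(config_path, "r") as f:
        config = json.load(f)
    return HParams(**config)
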
