Skip to content

Commit 8424b81

Browse files
committed
solved the DDP problem
1 parent f986181 commit 8424b81

File tree

11 files changed

+20051
-34169
lines changed

11 files changed

+20051
-34169
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,4 +23,5 @@ FLAME/
2323
# ignore the data
2424
data/HDTF_TFHP
2525
data/MNIST
26+
data/VOCASET
2627
data/data_pipline/audio_visual_dataset/

base/base_config.py

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -102,12 +102,19 @@ def __init__(self):
102102
###########################
103103
cfg.MODEL = CN()
104104
cfg.MODEL.NAME = ""
105+
cfg.MODEL.INIT_WEIGHTS = "" # Path to model weights (for initialization)
106+
cfg.MODEL.AUDIO_MODEL = 'wav2vec2'
107+
cfg.MODEL.AUDIO_DIM = 128
108+
105109
cfg.MODEL.MLP = CN()
106110
cfg.MODEL.MLP.INPUT_DIM = 784
107111
cfg.MODEL.MLP.HIDDEN_DIM = [128, 64]
108112
cfg.MODEL.MLP.OUTPUT_DIM = 10
109-
# Path to model weights (for initialization)
110-
cfg.MODEL.INIT_WEIGHTS = ""
113+
114+
115+
116+
117+
111118
# Definition of embedding layers
112119
cfg.MODEL.HEAD = CN()
113120
# If none, do not construct embedding layers, the
@@ -119,16 +126,10 @@ def __init__(self):
119126
cfg.MODEL.HEAD.ACTIVATION = "relu"
120127
cfg.MODEL.HEAD.BN = True
121128
cfg.MODEL.HEAD.DROPOUT = 0.0
122-
# VQ-VAE config
123-
cfg.MODEL.HEAD.N_EMBED = 256
124-
cfg.MODEL.HEAD.ZQUANT_DIM = 64
125-
# Audio model
126-
cfg.MODEL.HEAD.AUDIO_MODEL = 'wav2vec2'
127-
cfg.MODEL.HEAD.AUDIO_DIM = 128
128-
# Style ref
129-
cfg.MODEL.HEAD.STYLE_DIM = 128
130-
# Use indicator for padding frames
131-
cfg.MODEL.HEAD.USE_INDICATOR = False
129+
130+
131+
132+
cfg.MODEL.HEAD.USE_INDICATOR = False # Use indicator for padding frames
132133

133134
# optional head type according to different input
134135
cfg.MODEL.HEAD.ROT_REPR = 'aa'

base/base_trainer.py

Lines changed: 5 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,6 @@
3232
def build_trainer(cfg):
3333
avai_trainers = TRAINER_REGISTRY.registered_names()
3434
check_availability(cfg.TRAINER.NAME, avai_trainers)
35-
if cfg.ENV.VERBOSE:
36-
print("Loading trainer: {}".format(cfg.TRAINER.NAME))
3735
return TRAINER_REGISTRY.get(cfg.TRAINER.NAME)(cfg)
3836

3937
class TrainerBase:
@@ -98,21 +96,17 @@ def system_init(self):
9896
## cuda setting
9997
if torch.cuda.is_available() and self.cfg.ENV.USE_CUDA:
10098
torch.backends.cudnn.benchmark = True
101-
gpu_ids = self.cfg.ENV.GPU
102-
if not gpu_ids:
103-
raise ValueError("ENV.GPU must contain at least one gpu id when USE_CUDA=True")
104-
10599
if self.is_distributed:
106100
# In distributed mode, use local_rank to determine GPU
107-
target_gpu = gpu_ids[self.local_rank % len(gpu_ids)]
101+
target_gpu = self.cfg.ENV.GPU[self.local_rank % len(self.cfg.ENV.GPU)]
108102
else:
109-
target_gpu = gpu_ids[0]
110-
if len(gpu_ids) > 1 and torch.distributed.is_available():
103+
target_gpu = self.cfg.ENV.GPU[0]
104+
if len(self.cfg.ENV.GPU) > 1 and torch.distributed.is_available():
111105
# assume torchrun/launch supplies LOCAL_RANK; fallback to rank % len(gpu_ids)
112106
local_rank = int(os.environ.get("LOCAL_RANK", 0))
113107
if torch.distributed.is_initialized():
114-
local_rank = torch.distributed.get_rank() % len(gpu_ids)
115-
target_gpu = gpu_ids[local_rank % len(gpu_ids)]
108+
local_rank = torch.distributed.get_rank() % len(self.cfg.ENV.GPU)
109+
target_gpu = self.cfg.ENV.GPU[local_rank % len(self.cfg.ENV.GPU)]
116110

117111
self.device = torch.device(f"cuda:{target_gpu}")
118112
torch.cuda.set_device(self.device)
@@ -126,17 +120,11 @@ def _init_distributed(self):
126120
# Get local rank from environment variable (set by torchrun)
127121
self.local_rank = int(os.environ.get('LOCAL_RANK', -1))
128122

129-
if self.local_rank == -1:
130-
print("LOCAL_RANK not found in environment. Falling back to non-distributed mode.")
131-
self.cfg.ENV.DISTRIBUTED = False
132-
return
133-
134123
# Initialize process group
135124
dist.init_process_group(
136125
backend=self.cfg.ENV.DIST_BACKEND,
137126
init_method=self.cfg.ENV.DIST_URL
138127
)
139-
140128
self.rank = dist.get_rank()
141129
self.world_size = dist.get_world_size()
142130
self.is_distributed = True

config/difftalk_trainer_config.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@ ENV:
1414
TAGS: [Baseline]
1515
MODE: online
1616
EXTRA:
17-
STYLE_ENC_CKPT:
17+
STYLE_DIM: 128
18+
STYLE_ENC_CKPT:
1819

1920
DATASET:
2021
NAME: HDTF_TFHP

config/toy_trainer_config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
ENV:
44
SEED: 42
55
OUTPUT_DIR: ./output
6-
GPU: [0] # Multi-GPU training
6+
GPU: [0, 1] # List format - will be parsed as a list
77
USE_CUDA: True
88
VERBOSE: True
99

dataset/MINIST.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
"""
22
MNIST Dataset for Handwritten Digit Recognition
33
"""
4-
import os
54
from torchvision import datasets, transforms
65

76
from base.base_dataset import Datum, DatasetBase, DATASET_REGISTRY

0 commit comments

Comments (0)