Add LPL feature-statistics and training-metric logging

kashif · kashif · commit 717804264a9b · 2025-05-12T11:59:41.000+02:00
diff --git a/examples/research_projects/lpl/lpl_loss.py b/examples/research_projects/lpl/lpl_loss.py
@@ -61,6 +61,7 @@ def __init__(
         self.pow_law = pow_law
         self.norm_type = norm_type.lower()
         self.outlier_mask = remove_outliers
+        self.last_feature_stats = []  # Store feature statistics for logging
 
         assert feature_type in ["feature", "image"]
         self.feature_type = feature_type
@@ -132,15 +133,29 @@ def get_loss(self, input, target, get_hist=False):
             inp_f = self.get_features(self.shift + input / self.scale)
             tar_f = self.get_features(self.shift + target / self.scale, disable_grads=True)
             losses = []
+            self.last_feature_stats = []  # Reset feature stats
 
             for i, (x, y) in enumerate(zip(inp_f, tar_f, strict=False)):
                 my = torch.ones_like(y).bool()
+                outlier_ratio = 0.0
+                
                 if self.outlier_mask:
                     with torch.no_grad():
                         if i == 2:
                             my, y = remove_outliers(y, down_f=2)
+                            outlier_ratio = 1.0 - my.float().mean().item()
                         elif i in [3, 4, 5]:
                             my, y = remove_outliers(y, down_f=1)
+                            outlier_ratio = 1.0 - my.float().mean().item()
+
+                # Store feature statistics before normalization
+                with torch.no_grad():
+                    stats = {
+                        'mean': y.mean().item(),
+                        'std': y.std().item(),
+                        'outlier_ratio': outlier_ratio,
+                    }
+                    self.last_feature_stats.append(stats)
 
                 # normalize feature tensors
                 if self.norm_type == "default":
diff --git a/examples/research_projects/lpl/lpl_sdxl.py b/examples/research_projects/lpl/lpl_sdxl.py
@@ -1354,14 +1354,65 @@ def compute_time_ids(original_size, crops_coords_top_left):
                 progress_bar.update(1)
                 global_step += 1
 
+                # Enhanced logging for LPL metrics
                 log_data = {
                     "train_loss": train_loss,
                     "diffusion_loss": loss.item(),
+                    "learning_rate": lr_scheduler.get_last_lr()[0],
                 }
-                if args.use_lpl and lpl_loss_value.item() > 0:
-                    log_data["lpl_loss"] = lpl_loss_value.item()
+
+                if args.use_lpl and lpl_fn is not None and global_step >= args.lpl_start:
+                    if lpl_mask.any():
+                        # LPL application statistics
+                        log_data.update({
+                            "lpl/loss": lpl_loss_value.item(),
+                            "lpl/num_samples": lpl_mask.sum().item(),
+                            "lpl/application_ratio": lpl_mask.float().mean().item(),
+                            "lpl/weight": args.lpl_weight,
+                            "lpl/weighted_loss": (args.lpl_weight * lpl_loss_value).item(),
+                        })
+
+                        # SNR statistics for LPL-applied samples
+                        if args.snr_gamma is not None:
+                            snr_values = snr[masked_indices]
+                            log_data.update({
+                                "lpl/snr_mean": snr_values.mean().item(),
+                                "lpl/snr_std": snr_values.std().item(),
+                                "lpl/snr_min": snr_values.min().item(),
+                                "lpl/snr_max": snr_values.max().item(),
+                            })
+
+                        # Feature statistics if available (entries are already plain Python floats)
+                        if hasattr(lpl_fn, 'last_feature_stats'):
+                            for layer_idx, stats in enumerate(lpl_fn.last_feature_stats):
+                                log_data.update({
+                                    f"lpl/features/layer_{layer_idx}/mean": stats['mean'],
+                                    f"lpl/features/layer_{layer_idx}/std": stats['std'],
+                                    f"lpl/features/layer_{layer_idx}/outlier_ratio": stats.get('outlier_ratio', 0.0),
+                                })
+
+                        # Memory usage if available
+                        if torch.cuda.is_available():
+                            log_data.update({
+                                "lpl/memory/allocated": torch.cuda.memory_allocated() / 1024**2,  # MB
+                                "lpl/memory/reserved": torch.cuda.memory_reserved() / 1024**2,    # MB
+                            })
+
+                # Log to accelerator
                 accelerator.log(log_data, step=global_step)
 
+                # Update progress bar with more metrics
+                progress_bar_logs = {
+                    "loss": loss.detach().item(),
+                    "lr": lr_scheduler.get_last_lr()[0],
+                }
+                if args.use_lpl and lpl_loss_value.item() > 0:
+                    progress_bar_logs.update({
+                        "lpl": lpl_loss_value.item(),
+                        "lpl_ratio": lpl_mask.float().mean().item() if lpl_mask.any() else 0.0,
+                    })
+                progress_bar.set_postfix(**progress_bar_logs)
+
                 # DeepSpeed requires saving weights on every device; saving weights only on the main process would cause issues.
                 if accelerator.distributed_type == DistributedType.DEEPSPEED or accelerator.is_main_process:
                     if global_step % args.checkpointing_steps == 0: