# @package _global_

# Experiment config exp/diff_mae_0.yaml (archinetai/audio-diffusion-pytorch-trainer).
# To execute this experiment on a single GPU run:
# python train.py exp=diff_mae_0 trainer.gpus=1 +datamodule.dataset.path=/your_wav_path_here

# Root-level values shared with the sections below through ${...} interpolation.
# Read by datamodule.dataset.sample_rate and callbacks.audio_samples_logger.
sampling_rate: 48000
# Read by model.model.in_channels and callbacks.audio_samples_logger.
channels: 2
# Read by the dataset transform as random_crop_size (131072 = 2^17).
length: 131072

# Read by trainer.val_check_interval; trainer.log_every_n_steps is a separate
# key with its own value.
log_every_n_steps: 1000

# Training module (main.module_diff_mae.Model) and the network nested in it.
# NOTE(review): the lr_* and ema_* keys are presumably optimizer and EMA
# hyper-parameters consumed by main.module_diff_mae.Model — confirm there.
model:
  _target_: main.module_diff_mae.Model
  # Exponent floats carry an explicit decimal point so that YAML 1.1 loaders
  # (e.g. plain PyYAML) also read them as floats rather than strings.
  lr: 1.0e-4
  lr_beta1: 0.95
  lr_beta2: 0.999
  lr_eps: 1.0e-6
  lr_weight_decay: 1.0e-3
  ema_beta: 0.995
  ema_power: 0.7

  # The network itself, taken from the audio_diffusion_pytorch package.
  model:
    _target_: audio_diffusion_pytorch.DiffusionMAE1d
    # Absolute interpolation: resolves to the root-level `channels` (2), not
    # to the sibling `channels: 512` on the next line.
    in_channels: ${channels}
    channels: 512
    # NOTE(review): `multipliers` has 8 entries while `factors`, `num_blocks`
    # and `attentions` have 7 each — presumably intentional (one extra
    # multiplier); confirm against DiffusionMAE1d before changing a length.
    multipliers: [3, 2, 1, 1, 1, 1, 1, 1]
    factors: [1, 2, 2, 2, 2, 2, 2]
    num_blocks: [1, 1, 1, 2, 2, 2, 2]
    attentions: [0, 0, 0, 0, 0, 0, 0]
    # Encoder settings. The same length pattern appears here:
    # `encoder_multipliers` has 7 entries, the other two encoder lists have 6.
    encoder_inject_depth: 3
    encoder_channels: 32
    encoder_factors: [1, 1, 2, 2, 1, 1]
    encoder_multipliers: [32, 16, 8, 8, 4, 2, 1]
    encoder_num_blocks: [4, 4, 4, 4, 4, 4]
    # List of bottleneck modules; a single TanhBottleneck is configured.
    bottleneck:
      - _target_: audio_encoders_pytorch.TanhBottleneck
    # STFT settings.
    # NOTE(review): 1023 is an odd FFT size — presumably chosen so that
    # 1023 // 2 + 1 = 512 frequency bins; confirm before rounding it to 1024.
    stft_use_complex: true
    stft_num_fft: 1023
    stft_hop_length: 256
    # Diffusion settings.
    diffusion_type: v
    diffusion_sigma_distribution:
      _target_: audio_diffusion_pytorch.UniformDistribution


# Data pipeline: a WAV dataset plus the loader settings.
# The dataset location is not set in this section; the run instructions at the
# top of this file pass it on the command line as +datamodule.dataset.path=...
datamodule:
  _target_: main.module_diff_mae.Datamodule
  dataset:
    _target_: audio_data_pytorch.WAVDataset
    # Resolves to the root-level `sampling_rate`.
    sample_rate: ${sampling_rate}
    recursive: true
    transforms:
      _target_: audio_data_pytorch.AllTransform
      stereo: true
      # Resolves to the root-level `length`.
      random_crop_size: ${length}
  # Loader settings.
  batch_size: 24
  num_workers: 8
  pin_memory: true
  # NOTE(review): presumably the fraction of the dataset held out for
  # validation — confirm in main.module_diff_mae.Datamodule.
  val_split: 0.0001

# Callbacks: progress bar, checkpointing, model summary and audio sample logging.
callbacks:
  rich_progress_bar:
    _target_: pytorch_lightning.callbacks.RichProgressBar

  model_checkpoint:
    _target_: pytorch_lightning.callbacks.ModelCheckpoint
    # Name of the logged metric which determines when the model is improving.
    monitor: 'valid_loss'
    # Can be 'max' or 'min'.
    mode: 'min'
    # Save the k best models (determined by the metric above).
    save_top_k: 1
    # Additionally, always save the model from the last epoch.
    save_last: true
    verbose: false
    # `logs_dir` is not defined in this file; `now` adds a timestamp so each
    # run gets its own checkpoint directory.
    dirpath: ${logs_dir}/ckpts/${now:%Y-%m-%d-%H-%M-%S}
    # Uses the same metric name as `monitor`.
    filename: '{epoch:02d}-{valid_loss:.3f}'

  model_summary:
    _target_: pytorch_lightning.callbacks.RichModelSummary
    max_depth: 2

  audio_samples_logger:
    _target_: main.module_diff_mae.SampleLogger
    num_items: 3
    # Both resolve to the root-level keys of the same name.
    channels: ${channels}
    sampling_rate: ${sampling_rate}
    # NOTE(review): presumably one batch of samples is generated per listed
    # step count — confirm in main.module_diff_mae.SampleLogger.
    sampling_steps: [3, 5, 10, 25, 50, 100]
    use_ema_model: false
    diffusion_sampler:
      _target_: audio_diffusion_pytorch.VSampler
    diffusion_schedule:
      _target_: audio_diffusion_pytorch.LinearSchedule

# Experiment loggers; only Weights & Biases is configured.
loggers:
  wandb:
    _target_: pytorch_lightning.loggers.wandb.WandbLogger
    # Both values are read from environment variables; no fallback default is
    # given here, so WANDB_PROJECT and WANDB_ENTITY need to be set.
    project: ${oc.env:WANDB_PROJECT}
    entity: ${oc.env:WANDB_ENTITY}
    # offline: False  # set True to store all logs only locally
    job_type: 'train'
    group: ''
    # `logs_dir` is not defined in this file.
    save_dir: ${logs_dir}

# Arguments for pytorch_lightning.Trainer.
trainer:
  _target_: pytorch_lightning.Trainer
  # Set `1` to train on GPU, `0` to train on CPU only, and `-1` to train on
  # all GPUs, default `0`.
  gpus: 0
  # Precision used for tensors, default `32`.
  precision: 32
  # `ddp` GPUs train individually and sync gradients, default `None`.
  accelerator: null
  # NOTE(review): `max_epochs: -1` presumably means no epoch limit — confirm
  # against the installed Lightning version.
  min_epochs: 0
  max_epochs: -1
  # NOTE(review): presumably disabled because callbacks.model_summary already
  # provides a summary — confirm.
  enable_model_summary: false
  # Logs metrics every N batches. This is the trainer's own key and is
  # separate from the root-level `log_every_n_steps` (1000).
  log_every_n_steps: 1
  check_val_every_n_epoch: null
  # Absolute interpolation: resolves to the root-level `log_every_n_steps`
  # (1000), not to the sibling key above (1).
  val_check_interval: ${log_every_n_steps}