@@ -22,10 +22,10 @@ https://colab.research.google.com/gist/flavioschneider/39c6454bfc2d03dc7d0c5c9d8
 ```py
 from audio_diffusion_pytorch import AudioDiffusionModel
 
-model = AudioDiffusionModel()
+model = AudioDiffusionModel(in_channels=1)
 
 # Train model with audio sources
-x = torch.randn(2, 1, 2 ** 18) # [batch, channels, samples], 2**18 ≈ 12s of audio at a frequency of 22050
+x = torch.randn(2, 1, 2 ** 18) # [batch, in_channels, samples], 2**18 ≈ 12s of audio at a frequency of 22050Hz
 loss = model(x)
 loss.backward() # Do this many times
 
@@ -46,22 +46,21 @@ from audio_diffusion_pytorch import UNet1d
 # UNet used to denoise our 1D (audio) data
 unet = UNet1d(
     in_channels=1,
-    patch_size=16,
     channels=128,
+    patch_size=16,
+    kernel_sizes_init=[1, 3, 7],
     multipliers=[1, 2, 4, 4, 4, 4, 4],
     factors=[4, 4, 4, 2, 2, 2],
     attentions=[False, False, False, True, True, True],
     num_blocks=[2, 2, 2, 2, 2, 2],
     attention_heads=8,
     attention_features=64,
     attention_multiplier=2,
+    use_attention_bottleneck=True,
     resnet_groups=8,
     kernel_multiplier_downsample=2,
-    kernel_sizes_init=[1, 3, 7],
     use_nearest_upsample=False,
     use_skip_scale=True,
-    use_attention_bottleneck=True,
-    use_learned_time_embedding=True,
 )
 
 x = torch.randn(3, 1, 2 ** 16)
0 commit comments